Updated msword-search

This commit is contained in:
2Shirt 2018-12-27 20:30:40 -07:00
parent 4ddce7cfbe
commit 387062074a
Signed by: 2Shirt
GPG key ID: 152FAC923B0E132C

View file

@ -9,10 +9,10 @@ import sys
# STATIC VARIABLES # STATIC VARIABLES
SCANDIR = os.getcwd() SCANDIR = os.getcwd()
USAGE = '''Usage: {script} <search-terms>... USAGE = '''Usage: {script} <search-terms>...
e.g. {script} "Book Title" "Keyword" "etc" e.g. {script} "Book Title" "Keyword" "etc"
This script will search all doc/docx files below the current directory for This script will search all doc/docx files below the current directory for
the search-terms provided (case-insensitive).'''.format(script=__file__) the search-terms provided (case-insensitive).'''.format(script=__file__)
# Init # Init
os.chdir(os.path.dirname(os.path.realpath(__file__))) os.chdir(os.path.dirname(os.path.realpath(__file__)))
@ -23,59 +23,60 @@ init_global_vars()
# Matches file names ending in .doc or .docx (case-insensitive).
REGEX_DOC_FILES = re.compile(r'\.docx?$', re.IGNORECASE)


def scan_for_docs(path):
  """Recursively yield os.DirEntry objects for .doc/.docx files below path.

  Real directories are descended into; directory symlinks are not followed
  (is_dir is called with follow_symlinks=False).  Only entries that are
  files and whose name matches REGEX_DOC_FILES are yielded.
  """
  for entry in os.scandir(path):
    if entry.is_dir(follow_symlinks=False):
      yield from scan_for_docs(entry.path)
    # is_file must be *called*: the bare bound method is always truthy, which
    # let any non-directory entry (e.g. a symlinked directory named
    # "x.docx") through to the name check.
    elif entry.is_file() and REGEX_DOC_FILES.search(entry.name):
      yield entry
def scan_file(file_path, search):
  """Return file_path if the document's extracted text matches search.

  file_path: path to a .doc or .docx file (anything not ending in .docx is
    treated as .doc).
  search: regular expression, matched case-insensitively.

  .docx files are dumped with `unzip -p`, .doc files are converted with
  `antiword`; both are run through the project's run_program helper, whose
  result is expected to expose the captured output as bytes in .stdout.
  Returns None when nothing matches or when extraction fails for any reason.
  """
  match = False
  try:
    # Use the file_path argument; the previous body read a global `entry`
    # that only existed because the __main__ loop happened to leak it.
    if file_path.lower().endswith('.docx'):
      result = run_program(['unzip', '-p', file_path])
    else:
      # Assuming .doc
      result = run_program(['antiword', file_path])
    # `unzip -p` emits every archive member, including binary ones such as
    # embedded images; a strict decode would raise on those bytes and the
    # file would be silently skipped, so undecodable bytes are dropped.
    out = result.stdout.decode(errors='ignore')
    match = re.search(search, out, re.IGNORECASE)
  except Exception:
    # Ignore errors since files may be corrupted
    pass
  return file_path if match else None
if __name__ == '__main__':
  # Script entry point: search every doc/docx file below SCANDIR for any of
  # the terms given on the command line and print the matching paths.
  try:
    # Prep
    clear_screen()
    if len(sys.argv) == 1:
      # Print usage
      print_standard(USAGE)
    else:
      # Let any run of whitespace in a term match zero or more whitespace
      # characters.  The replacement has to be r'\\s*' (an escaped
      # backslash): a bare '\s' in a replacement template is rejected with
      # re.error on Python 3.7+.
      terms = [re.sub(r'\s+', r'\\s*', t) for t in sys.argv[1:]]
      # Combine all terms into a single alternation so each file is scanned
      # only once.
      search = '({})'.format('|'.join(terms))

      matches = []
      for entry in scan_for_docs(SCANDIR):
        matches.append(scan_file(entry.path, search))
      # Strip None values (i.e. non-matching entries)
      matches = [m for m in matches if m]

      if matches:
        print_success('Found {} {}:'.format(
          len(matches),
          'Matches' if len(matches) > 1 else 'Match'))
        for match in matches:
          print_standard(match)
      else:
        print_error('No matches found.')

    # Done
    print_standard('\nDone.')
    #pause("Press Enter to exit...")
    exit_script()
  except SystemExit:
    # Normal exit path; nothing to report.
    pass
  except:
    # Deliberately broad: every other failure is handed to the project's
    # major_exception() handler.
    major_exception()
# vim: sts=2 sw=2 ts=2