#!/bin/python3
#
## Wizard Kit: MS Word content search tool
#
# Python implementation; supersedes the bash version that lived at
# .bin/Scripts/linux-old/msword-search.

import os
import re
import sys

# STATIC VARIABLES
# Captured before the chdir below so the scan starts in the directory the
# user invoked the script from, not in the script's own directory.
SCANDIR = os.getcwd()
USAGE = '''Usage: {script} ...
    e.g. {script} "Book Title" "Keyword" "etc"

    This script will search all doc/docx files below the current directory for
    the search-terms provided (case-insensitive).'''.format(script=__file__)

# Init
# Switch to the script's directory and add it to sys.path so the project's
# `functions` package can be imported. The helper functions used below
# (run_program, print_*, clear_screen, exit_script, major_exception) are
# expected to come from this star import.
os.chdir(os.path.dirname(os.path.realpath(__file__)))
sys.path.append(os.getcwd())
from functions.network import *
init_global_vars()

REGEX_DOC_FILES = re.compile(r'\.docx?$', re.IGNORECASE)


def scan_for_docs(path):
    """Recursively yield os.DirEntry objects for .doc/.docx files under path.

    Symlinked directories are not descended into.
    """
    for entry in os.scandir(path):
        if entry.is_dir(follow_symlinks=False):
            # Recurse into this same generator (the original called an
            # undefined name here and crashed on the first subdirectory).
            yield from scan_for_docs(entry.path)
        elif entry.is_file() and REGEX_DOC_FILES.search(entry.name):
            yield entry


def scan_file(file_path, search):
    """Return file_path if the document's extracted text matches search.

    file_path -- path to a .doc or .docx file.
    search    -- regular expression (str); matched case-insensitively.

    Returns None when nothing matches or when the text could not be
    extracted. Raises re.error if search is not a valid pattern.
    """
    try:
        if file_path.lower().endswith('.docx'):
            # NOTE(review): without a member name `unzip -p` streams every
            # file in the archive; the old bash tool limited the search to
            # word/document.xml. Confirm the wider search is intended.
            result = run_program(['unzip', '-p', file_path])
        else:
            # Assuming .doc
            result = run_program(['antiword', file_path])
        # Tolerate undecodable bytes: a strict decode would raise and the
        # file would be silently reported as a non-match.
        out = result.stdout.decode(errors='ignore')
    except Exception:
        # Ignore errors since files may be corrupted
        return None

    # Matching happens outside the try block so that a bad pattern is
    # reported instead of being mistaken for a corrupted file.
    return file_path if re.search(search, out, re.IGNORECASE) else None


def build_search(terms):
    """Combine the search terms into a single alternation pattern.

    Each run of whitespace inside a term is replaced with \\s* so a term
    still matches when the document spaces or wraps the words differently.
    """
    # The replacement must be an escaped backslash: a bare r'\s*' template
    # is rejected by re.sub with "bad escape" on Python 3.7+.
    flexible = [re.sub(r'\s+', r'\\s*', term) for term in terms]
    return '({})'.format('|'.join(flexible))


def main():
    """Search SCANDIR for documents matching the command-line terms."""
    # Prep
    clear_screen()
    # Blank terms would produce a pattern that matches every document.
    terms = [t for t in sys.argv[1:] if t.strip()]

    if not terms:
        # Print usage
        print_standard(USAGE)
    else:
        search = build_search(terms)
        try:
            # Validate once up front rather than failing on every file.
            re.compile(search)
        except re.error as err:
            print_error('Invalid search term: {}'.format(err))
        else:
            matches = [
                entry.path
                for entry in scan_for_docs(SCANDIR)
                if scan_file(entry.path, search)
            ]
            if matches:
                print_success('Found {} {}:'.format(
                    len(matches),
                    'Matches' if len(matches) > 1 else 'Match'))
                for match in matches:
                    print_standard(match)
            else:
                print_error('No matches found.')

    # Done
    print_standard('\nDone.')
    exit_script()


if __name__ == '__main__':
    try:
        main()
    except SystemExit:
        pass
    except:
        # Deliberately broad: any other failure (including Ctrl-C) is
        # handed to the project's crash handler, as in the original.
        major_exception()