New MS Word search script

2017-12-21 22:31:21 -07:00 · 2017-12-21 22:31:21 -07:00 · 1cfa008b8e
commit 1cfa008b8e
parent 96ef259b4c
2 changed files with 81 additions and 39 deletions
--- a/.bin/Scripts/linux-old/msword-search
+++ b/.bin/Scripts/linux-old/msword-search
@ -1,39 +0,0 @@
-#!/bin/bash
-#
-## Wizard Kit: MS Word content search tool
-#
-# Searches the *doc and *docx files in the current directory for each search
-# term given on the command line (case-insensitive) and appends the unique
-# matching file names to $HOME/msword-matches.txt.
-TMP_FILE="$(mktemp)"
-
-# Split the arguments on newlines only, so a term containing spaces stays whole
-IFS=$'\n'
-for s in $*; do
-    REGEX="$s"
-    # Replace each run of whitespace in the term with \s* (zero or more whitespace)
-    REGEX=$(echo "$REGEX" | sed -r 's/\s+/\\s\*/g')
-    
-    # Word Doc
-    for d in *doc; do
-        if antiword "$d" | grep -iqsP "($REGEX)"; then
-            echo "Possible match: $d"
-            echo "$d" >> "$TMP_FILE"
-        fi
-    done
-    
-    # Word Docx
-    for d in *docx; do
-        # Only the main document body (word/document.xml) is searched
-        if unzip -p "$d" word/document.xml | grep -iqsP "($REGEX)"; then
-            echo "Possible match: $d"
-            echo "$d" >> "$TMP_FILE"
-        fi
-    done
-
-done
-
-# Cleanup results
-# De-duplicate this run's matches and append them to the results file
-if [[ -s "$TMP_FILE" ]]; then
-    sort -u "$TMP_FILE" >> "$HOME/msword-matches.txt"
-fi
-rm "$TMP_FILE"
-
-# Done
-if [[ -s "$HOME/msword-matches.txt" ]]; then
-    echo "Found $(wc -l "$HOME/msword-matches.txt") possible matches"
-    echo "The results have been saved to $HOME"
-fi
--- a/.bin/Scripts/msword-search
+++ b/.bin/Scripts/msword-search
@ -0,0 +1,81 @@
+#!/bin/python3
+#
+## Wizard Kit: MS Word content search tool
+
+import os
+import re
+import sys
+
+# STATIC VARIABLES
+# Directory to search: the working directory at launch, captured before the
+# os.chdir() call below changes it
+SCANDIR = os.getcwd()
+USAGE = '''Usage: {script} <search-terms>...
+    e.g.  {script} "Book Title" "Keyword" "etc"
+
+    This script will search all doc/docx files below the current directory for
+    the search-terms provided (case-insensitive).'''.format(script=__file__)
+
+# Init
+# Switch to the directory containing this script and add it to sys.path so
+# the "functions" package can be imported from there
+os.chdir(os.path.dirname(os.path.realpath(__file__)))
+sys.path.append(os.getcwd())
+from functions.network import *
+init_global_vars()
+
+# Matches file names ending in .doc or .docx (case-insensitive)
+REGEX_DOC_FILES = re.compile(r'\.docx?$', re.IGNORECASE)
+
+def scan_for_docs(path):
+    """Recursively yield os.DirEntry objects for Word documents below path.
+
+    Sub-directories are descended into (symlinked directories are not
+    followed). Only files whose names match REGEX_DOC_FILES are yielded.
+    """
+    for entry in os.scandir(path):
+        if entry.is_dir(follow_symlinks=False):
+            # Recurse into the sub-directory
+            yield from scan_for_docs(entry.path)
+        elif entry.is_file() and REGEX_DOC_FILES.search(entry.name):
+            # is_file must be called; the bare method object is always truthy
+            yield entry
+
+def scan_file(file_path, search):
+    """Return file_path if the document's contents match search, else None.
+
+    file_path: path of the .doc or .docx file to inspect.
+    search:    regular expression, applied case-insensitively.
+
+    Files ending in .docx are dumped with `unzip -p`; anything else is
+    assumed to be a .doc file and is passed to `antiword`. Any error raised
+    while extracting or searching is treated as "no match".
+    """
+    match = None
+    try:
+        if file_path.lower().endswith('.docx'):
+            result = run_program(['unzip', '-p', file_path])
+        else:
+            # Assuming .doc
+            result = run_program(['antiword', file_path])
+        # unzip -p dumps every archive member, so the output may contain
+        # bytes that are not valid text; drop them instead of failing the
+        # whole file on a decode error
+        out = result.stdout.decode(errors='ignore')
+        match = re.search(search, out, re.IGNORECASE)
+    except Exception:
+        # Ignore errors since files may be corrupted
+        pass
+
+    return file_path if match else None
+
+if __name__ == '__main__':
+    # Entry point: print the usage text when no search terms are given,
+    # otherwise scan every doc/docx file below SCANDIR and list the matches.
+    try:
+        # Prep
+        clear_screen()
+
+        if len(sys.argv) == 1:
+            # Print usage
+            print_standard(USAGE)
+        else:
+            # Let each run of whitespace in a term match any amount of
+            # whitespace (including none). The backslash is doubled because
+            # re.sub() rejects unknown escapes such as "\s" in a replacement
+            # template on Python 3.7+.
+            terms = [re.sub(r'\s+', r'\\s*', t) for t in sys.argv[1:]]
+            # Combine all terms into a single alternation
+            search = '({})'.format('|'.join(terms))
+
+            matches = []
+            for entry in scan_for_docs(SCANDIR):
+                result = scan_file(entry.path, search)
+                # scan_file returns None for non-matching entries
+                if result:
+                    matches.append(result)
+            if matches:
+                print_success('Found {} {}:'.format(
+                    len(matches),
+                    'Matches' if len(matches) > 1 else 'Match'))
+                for match in matches:
+                    print_standard(match)
+            else:
+                print_error('No matches found.')
+
+        # Done
+        print_standard('\nDone.')
+        #pause("Press Enter to exit...")
+        exit_script()
+    except SystemExit:
+        pass
+    except:
+        major_exception()
+