New MS Word search script
This commit is contained in:
parent
96ef259b4c
commit
1cfa008b8e
2 changed files with 81 additions and 39 deletions
|
|
@ -1,39 +0,0 @@
|
|||
#!/bin/bash
#
## Wizard Kit: MS Word content search tool
#
# Usage: msword-search <search-term>...
#
# Searches the .doc/.docx files in the current directory for each search term
# (case-insensitive, Perl-style regex; whitespace inside a term matches any
# amount of whitespace) and appends the names of the matching files to
# $HOME/msword-matches.txt.

RESULTS_FILE="$HOME/msword-matches.txt"
TMP_FILE="$(mktemp)" || exit 1

# Remove the scratch file however the script exits
trap 'rm -f "$TMP_FILE"' EXIT

# nullglob:   a pattern with no matches expands to nothing instead of being
#             passed through literally (e.g. antiword run on "*.doc").
# nocaseglob: also pick up names such as REPORT.DOC
shopt -s nullglob nocaseglob

# "$@" keeps every term intact; an unquoted $* would glob-expand terms
# containing characters such as * or ?
for s in "$@"; do
    # Let each run of whitespace in the term match zero or more whitespace chars
    REGEX="$(printf '%s\n' "$s" | sed -r 's/\s+/\\s\*/g')"

    # Word Doc
    for d in *.doc; do
        if antiword "$d" | grep -iqsP "($REGEX)"; then
            echo "Possible match: $d"
            echo "$d" >> "$TMP_FILE"
        fi
    done

    # Word Docx
    for d in *.docx; do
        if unzip -p "$d" word/document.xml | grep -iqsP "($REGEX)"; then
            echo "Possible match: $d"
            echo "$d" >> "$TMP_FILE"
        fi
    done
done

# Cleanup results (a file matching several terms is only recorded once)
if [[ -s "$TMP_FILE" ]]; then
    sort -u "$TMP_FILE" >> "$RESULTS_FILE"
fi

# Done
if [[ -s "$RESULTS_FILE" ]]; then
    # Read via stdin so wc prints only the count, not the file name
    echo "Found $(wc -l < "$RESULTS_FILE") possible matches"
    echo "The results have been saved to $HOME"
fi
|
||||
81
.bin/Scripts/msword-search
Executable file
81
.bin/Scripts/msword-search
Executable file
|
|
@ -0,0 +1,81 @@
|
|||
#!/bin/python3
|
||||
#
|
||||
## Wizard Kit: MS Word content search tool
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
# STATIC VARIABLES
# Directory to search: the working directory at launch. It must be captured
# here because the os.chdir() call below changes the working directory.
SCANDIR = os.getcwd()
# Help text printed when the script is run without any search terms.
USAGE = '''Usage: {script} <search-terms>...
e.g. {script} "Book Title" "Keyword" "etc"

This script will search all doc/docx files below the current directory for
the search-terms provided (case-insensitive).'''.format(script=__file__)

# Init
# Switch to the directory containing this script (symlinks resolved) and add
# it to sys.path so that the `functions` package imported below can be found.
os.chdir(os.path.dirname(os.path.realpath(__file__)))
sys.path.append(os.getcwd())
# NOTE(review): the wildcard import presumably supplies run_program,
# clear_screen, print_*, exit_script, major_exception and init_global_vars,
# none of which are defined in this file - confirm against functions.network.
from functions.network import *
init_global_vars()

# Matches file names ending in ".doc" or ".docx", ignoring case.
REGEX_DOC_FILES = re.compile(r'\.docx?$', re.IGNORECASE)
|
||||
|
||||
def scan_for_docs(path):
    """Recursively yield directory entries for Word documents below *path*.

    Args:
        path: Directory to start scanning from.

    Yields:
        os.DirEntry objects for every file whose name matches
        REGEX_DOC_FILES, found in *path* or any of its subdirectories.
        Symlinked directories are not descended into.
    """
    for entry in os.scandir(path):
        if entry.is_dir(follow_symlinks=False):
            # Recurse into real subdirectories (the original called an
            # undefined name here and crashed on the first subdirectory).
            yield from scan_for_docs(entry.path)
        elif entry.is_file() and REGEX_DOC_FILES.search(entry.name):
            # is_file must be *called*; the bare bound method is always truthy
            yield entry
|
||||
|
||||
def scan_file(file_path, search):
    """Search the extracted contents of one Word document for a pattern.

    The contents are extracted by running ``unzip -p`` on .docx files and
    ``antiword`` on everything else (assumed to be .doc), then searched
    case-insensitively.

    Args:
        file_path: Path of the document to scan.
        search: Regular expression (string) to look for.

    Returns:
        *file_path* if the pattern was found, otherwise None. Any error
        raised while extracting or searching is treated as "no match".
    """
    if file_path.lower().endswith('.docx'):
        cmd = ['unzip', '-p', file_path]
    else:
        # Assuming .doc
        cmd = ['antiword', file_path]

    match = None
    try:
        result = run_program(cmd)
        # `unzip -p` dumps every archive member, including binary ones, so
        # undecodable bytes are dropped rather than aborting the whole scan.
        out = result.stdout.decode(errors='ignore')
        match = re.search(search, out, re.IGNORECASE)
    except Exception:
        # Ignore errors since files may be corrupted
        pass

    return file_path if match else None
|
||||
|
||||
def build_search_regex(terms):
    r"""Combine search terms into a single alternation pattern.

    Every run of whitespace inside a term is replaced with ``\s*`` so the
    term matches however its words are spaced in the document. Terms are
    otherwise used as-is, i.e. they are treated as regular expressions.

    Args:
        terms: Iterable of search-term strings.

    Returns:
        A pattern string of the form ``(term1|term2|...)``.
    """
    # A callable replacement is required: since Python 3.7 re.sub() raises
    # re.error for "\s" inside a replacement *template* string.
    flexible = [re.sub(r'\s+', lambda _m: r'\s*', term) for term in terms]
    return '({})'.format('|'.join(flexible))


if __name__ == '__main__':
    try:
        # Prep
        clear_screen()
        terms = sys.argv[1:]

        if not terms:
            # Print usage
            print_standard(USAGE)
        else:
            search = build_search_regex(terms)

            # Collect the paths of matching documents only
            matches = []
            for entry in scan_for_docs(SCANDIR):
                found = scan_file(entry.path, search)
                if found:
                    matches.append(found)

            if matches:
                print_success('Found {} {}:'.format(
                    len(matches),
                    'Matches' if len(matches) > 1 else 'Match'))
                for match in matches:
                    print_standard(match)
            else:
                print_error('No matches found.')

        # Done
        print_standard('\nDone.')
        #pause("Press Enter to exit...")
        exit_script()
    except SystemExit:
        # Normal termination; nothing to report
        pass
    except:
        # Top-level boundary: hand anything unexpected to the shared handler
        major_exception()
|
||||
|
||||
Loading…
Reference in a new issue