Created
October 3, 2016 05:02
-
-
Save csm10495/9551dd8bcd06510cbd868954c920353b to your computer and use it in GitHub Desktop.
Quick and Dirty Recursive File Regex Counter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Brief:
    This searches a directory recursively (by default the current directory) for a given regex match. The total number of matches is counted and printed at the end.
    By default, it looks for lines where the author tag is given but there is no name provided.
Author(s):
    Charles Machalow
'''
import os
import re
# Default pattern: a line containing "Author...:" whose NEXT line is blank,
# whitespace only, or just a closing docstring delimiter (''' or """) --
# i.e. an author tag with no name filled in underneath it.
REGEX = re.compile(r".*Author.*:\n(?:\s*|\s*\'\'\'\s*|\s*\"\"\"\s*)\n")

# Files whose size in bytes is not strictly below this limit are not read.
MAX_FILE_SIZE = 1024 * 128 #128 KB
def _countMatchesInFile(path, displayPath, name, regex, maxFileSize):
    '''
    Brief:
        Prints the 'File: ...' line for one file and returns the number of regex matches in its text.
        Returns 0 when the file is not smaller than maxFileSize or cannot be read/decoded
        (a 'Skipping File...' note is printed for the unreadable cases).
    '''
    try:
        print('File: %s' % displayPath)
    except UnicodeEncodeError:
        # stdout cannot represent the path; fall back to showing its UTF-8 bytes
        print('File: %s' % displayPath.encode('utf8'))

    # Carets that sit under the file-name portion of the 'File: ...' line above.
    marker = ' ' * (len('File: ') + len(displayPath) - len(name)) + '^' * len(name)

    try:
        if os.stat(path).st_size >= maxFileSize:
            return 0
        with open(path, 'r') as f:
            # Some files have lots of extra null chars... don't know why...
            txt = f.read().strip('\0')
    except UnicodeDecodeError as ex:
        print(marker + ' Skipping File... odd byte found (%s)' % ex.reason)
        return 0
    except OSError as ex:
        # Unreadable file (permissions, vanished mid-scan, ...): skip only this
        # file instead of abandoning the rest of the folder.
        print(marker + ' Skipping File... could not read (%s)' % ex.strerror)
        return 0
    except Exception:
        # Best-effort scan: never let a single odd file stop the whole run.
        print(marker + ' Skipping File... odd byte found (Generic Exception)')
        return 0

    return len(re.findall(regex, txt))


def searchFolder(folder, regex=None, maxFileSize=None):
    '''
    Brief:
        Recursively counts regex matches in the text of every file below folder,
        printing a 'File: ...' / 'Folder: ...' line for each entry visited.

    Parameters:
        folder      - directory to scan; may be relative or absolute.
        regex       - pattern (compiled or string) to count; defaults to the module-level REGEX.
        maxFileSize - files of this many bytes or more are not read; defaults to MAX_FILE_SIZE.

    Returns:
        Total number of matches found (int).

    Notes:
        - Paths are built from folder, so the process working directory is never changed.
        - A directory that cannot be listed (PermissionError) contributes 0 matches.
        - Symlinked directories are not descended into (avoids link cycles);
          entries that are neither regular files nor directories are ignored.
    '''
    if regex is None:
        regex = REGEX
    if maxFileSize is None:
        maxFileSize = MAX_FILE_SIZE

    try:
        names = os.listdir(folder)
    except PermissionError:
        return 0

    # Absolute form is used for display only, matching the absolute paths the
    # script has always printed.
    baseDir = os.path.abspath(folder)

    regexMatches = 0
    for name in names:
        entryPath = os.path.join(folder, name)
        displayPath = os.path.join(baseDir, name)
        if os.path.isfile(entryPath):
            regexMatches += _countMatchesInFile(entryPath, displayPath, name, regex, maxFileSize)
        elif os.path.isdir(entryPath) and not os.path.islink(entryPath):
            print('Folder: %s' % displayPath)
            regexMatches += searchFolder(entryPath, regex, maxFileSize)
    return regexMatches
if __name__ == '__main__':
    # Script entry point: scan the current directory tree and report the total.
    print ("\nRegex Matches: %d" % (searchFolder('.')))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment