Skip to content

Instantly share code, notes, and snippets.

@alexfriant
Last active August 31, 2022 21:21
Show Gist options
  • Save alexfriant/3d1bc1102d770361d21e66850a1146aa to your computer and use it in GitHub Desktop.
Save alexfriant/3d1bc1102d770361d21e66850a1146aa to your computer and use it in GitHub Desktop.
This Python script will provide a summary of alphanumeric patterns which exist in a list of values
#####################################################################################
#
# Requirements: You'll need Python 3.5.1 or higher to run this
#
# This script will provide you a basic understanding of the alphanumeric patterns
# which exist in a list. You might get this list from a SQL query or something like
# that.
#
# INPUT: Give this script a text file that has a single column of ID-type strings
# (comma-separated values on one line are treated as separate strings).
# EXAMPLE (from command line):
# > python patternEyes.py "c:\temp\id_list.txt"
#
# OUTPUT: A printed report listing each pattern and how often it occurs, in which
# digits are shown as "#" and alpha characters as "X". All punctuation stays as it exists.
#
# For example, if you want to see if all records are phone numbers, you might expect
# to see something like this:
# (###)-###-####
# But if you also see something like this, you know the data isn't as "clean" as
# you were hoping, requiring further investigation:
# ##-XXX-######
#
#####################################################################################
import re, os.path, sys
from collections import defaultdict
from pathlib import Path
def patternEyes( filePath = r'c:\temp\id_list.txt'):
    """Print a ranked summary of the alphanumeric patterns found in a file.

    Every line of the file is split on commas. In each resulting value,
    digits are replaced by "#" and the letters a-z (in either case) by "X";
    every other character is kept as it is. The distinct patterns are then
    printed together with their counts, most frequent first. Patterns with
    the same count keep the order in which they were first seen. Empty
    values are counted under the label "No Data".

    Args:
        filePath: Path of the text file to analyse.

    Returns:
        None. The report (or a message when the path is not an existing
        file) is written to standard output.
    """
    input_file = filePath

    # Guard clause: there is nothing to report without an existing file.
    if not os.path.isfile(input_file):
        print("\nSorry, there is no file here: {}".format(input_file))
        return

    digit_re = re.compile(r'\d')
    alpha_re = re.compile(r'[a-z]', re.IGNORECASE)

    pattern_counts = defaultdict(int)
    # The context manager closes the file even if reading raises part-way.
    with open(input_file, 'r') as handle:
        for line in handle:
            for value in line.strip('\n').split(','):
                # Digits first, then letters; "#" is never touched by alpha_re.
                pattern = alpha_re.sub('X', digit_re.sub('#', value))
                # An empty pattern means the value itself was empty.
                pattern_counts[pattern or 'No Data'] += 1

    # sorted() is stable, so equal counts stay in first-seen order.
    ranked = sorted(
        pattern_counts.items(), key=lambda item: item[1], reverse=True
    )

    print("\nREPORT FOR: {}".format(Path(input_file).resolve()))
    print("\n{0:20} | {1:10}".format("PATTERN", "COUNT"))
    print("-"*30)
    for pattern, count in ranked:
        print("{0:20} | {1:10}".format(pattern, str(count)))
def main( inputs ):
    """Run patternEyes using a command-line style argument list.

    Args:
        inputs: Argument list whose first item is the script name, as in
            sys.argv. When it holds exactly two items, the second one is
            passed to patternEyes as the file path; for any other length
            patternEyes is called with its default path.
    """
    if len(inputs) != 2:
        # No single explicit path was given: use patternEyes' default.
        patternEyes()
        return
    patternEyes(inputs[1])
# Script entry point: hand the raw command-line arguments to main().
if __name__ == "__main__":
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment