Skip to content

Instantly share code, notes, and snippets.

@rickcrawford
Created October 23, 2018 05:08
Show Gist options
  • Save rickcrawford/0b47bb082f869517226e6710982e7cd5 to your computer and use it in GitHub Desktop.
Save rickcrawford/0b47bb082f869517226e6710982e7cd5 to your computer and use it in GitHub Desktop.
Complaint data processing scripts
# Downloads the consumer complaint data set published by the Consumer
# Financial Protection Bureau (data.consumerfinance.gov), splits the complaint
# narratives into one CSV per product with process.py, and builds
# training_set.csv from the last 1000 rows of each per-product file.

# Stop at the first failing step so a bad download is never processed.
set -e

URL="https://data.consumerfinance.gov/api/views/s6ew-h6mp/rows.csv?accessType=DOWNLOAD"
FILE=complaints.csv

if [ ! -f "$FILE" ]; then
    echo "-- Downloading file --"
    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # error page. Downloading to a temporary name means an interrupted
    # transfer is not mistaken for a complete file on the next run.
    curl -L --fail "$URL" --output "$FILE.part"
    mv "$FILE.part" "$FILE"
fi

python process.py "$FILE"

# Start from an empty training set so re-running does not duplicate rows.
: > training_set.csv
for filename in output/*.csv; do
    # When nothing matches, the glob stays literal; skip it.
    [ -e "$filename" ] || continue
    tail -n 1000 "$filename" >> training_set.csv
done

rm -r output/
import csv
import hashlib
import logging
import md5
import os
import re
import sys
# Classes
# Optional whitelist of (Product, Sub-product) pairs. process_csv compares it
# against the raw, uncleaned values of those two CSV columns. An empty list
# disables the filter, so rows for every product are written.
OUTPUT_PRODUCTS = []
# Example whitelist (uncomment to restrict the output to these pairs):
#OUTPUT_PRODUCTS = [('Credit card',''),('Credit reporting',''),('Debt collection','Credit card'),('Consumer Loan','Vehicle loan')]
class OutputWriter:
    """Write ``[value, tag]`` rows into one CSV file per tag under ``output/``.

    The ``output`` directory must already exist. Files are created lazily the
    first time a tag is seen and are named after the lower-cased tag with
    every run of non-alphanumeric characters replaced by ``_``.

    Args:
        max_values_per_tag: Stored on the instance; not enforced by this
            class.
        min_values_per_tag: Files that received fewer rows than this are
            deleted by :meth:`close`.
        max_values: Stored on the instance; not enforced by this class.
        unique_values: When true, a value is written at most once. The
            de-duplication is global, not per tag.
        excluded_tags: Tags whose files are always deleted by :meth:`close`,
            however many rows they received.
    """

    def __init__(self, max_values_per_tag=1000, min_values_per_tag=6000,
                 max_values=100000, unique_values=True,
                 excluded_tags=('Credit_card_or_prepaid_card',)):
        self.files = {}       # tag -> open file object
        self._writers = {}    # tag -> csv.writer bound to self.files[tag]
        self.counters = {}    # tag -> number of rows written
        self.values = set()   # MD5 hex digests of every value written so far
        self.total = 0        # rows written across all tags
        self.unique_values = unique_values
        self.max_values_per_tag = max_values_per_tag
        self.max_values = max_values
        self.min_values_per_tag = min_values_per_tag
        self.excluded_tags = frozenset(excluded_tags)

    def _get_writer(self, tag):
        """Return the csv writer for *tag*, creating its file on first use."""
        writer = self._writers.get(tag)
        if writer is None:
            filename = re.sub(r'[^a-z0-9]+', '_', tag.lower()) + ".csv"
            f = open("output/" + filename, 'w')
            self.files[tag] = f
            logging.debug("creating new file: %s", filename)
            # Build the writer once per file rather than once per row.
            writer = csv.writer(f, delimiter=',', quotechar='"')
            self._writers[tag] = writer
        return writer

    def write(self, tag, value):
        """Append *value* to the file for *tag*.

        Both arguments are stripped first. The row is silently skipped when
        either is empty, or when ``unique_values`` is set and the same value
        has already been written under any tag.
        """
        value = value.strip()
        tag = tag.strip()
        if not value or not tag:
            return

        if self.unique_values:
            # hashlib needs bytes; text is encoded, byte strings pass through.
            raw = value if isinstance(value, bytes) else value.encode('utf-8')
            value_key = hashlib.md5(raw).hexdigest()
            if value_key in self.values:
                return
            self.values.add(value_key)

        self._get_writer(tag).writerow([value, tag])
        self.counters[tag] = self.counters.get(tag, 0) + 1
        self.total += 1

    def close(self):
        """Close every file and delete the ones that should not be kept.

        A file is deleted when its tag received fewer than
        ``min_values_per_tag`` rows or is listed in ``excluded_tags``.
        Calling ``close`` again afterwards is a no-op.
        """
        for tag, f in self.files.items():
            count = self.counters.get(tag, 0)
            logging.debug("closing file: %s, count:%d", f.name, count)
            # Close before unlinking so buffered rows are flushed and the
            # handle is released.
            f.close()
            if count < self.min_values_per_tag or tag in self.excluded_tags:
                os.unlink(f.name)
        self.files.clear()
        self._writers.clear()
def clean_tag(tag):
    """Turn a product/issue label into an identifier-style tag.

    Only the text before the first comma is kept; each run of characters
    other than ASCII letters and digits in it becomes a single underscore.
    """
    head, _, _ = tag.partition(',')
    return re.sub(r'[^A-Za-z0-9]+', '_', head)
def clean_value(value):
    """Scrub a complaint narrative down to plain, single-spaced text.

    Newlines, carriage returns, quotes, commas, slashes, braces, square
    brackets and runs of two or more capital ``X`` (presumably redaction
    placeholders in the narratives) are each replaced by a space. Every run
    of whitespace is then collapsed to one space. The result is not
    stripped, so it may begin or end with a single space.
    """
    scrubbed = re.sub(r'[\n\r\'\",\/\{\}\[\]]|X{2,}', ' ', value)
    collapsed = re.sub(r'\s+', ' ', scrubbed)
    return collapsed
###############
# process_csv - process the csv file for complaint data.
#
# The file has the following headers:
# * Date received
# * Product
# * Sub-product
# * Issue
# * Sub-issue
# * Consumer complaint narrative
# * Company public response
# * Company
# * State
# * ZIP code
# * Tags
# * Consumer consent provided?
# * Submitted via
# * Date sent to company
# * Company response to consumer
# * Timely response?
# * Consumer disputed?
# * Complaint ID
def _open_csv(path):
    """Open *path* the way the running interpreter's csv module expects."""
    if sys.version_info[0] >= 3:
        # Python 3's csv module reads text and wants newline=''. Undecodable
        # bytes are replaced so one bad row cannot abort the whole run.
        return open(path, 'r', newline='', encoding='utf-8', errors='replace')
    # Python 2's csv module reads bytes.
    return open(path, 'rb')


def process_csv(incoming, min_words=10, max_words=150):
    """Split the complaint narratives in *incoming* into per-product files.

    The first row is treated as the header and skipped. Each later row with
    a non-empty narrative (column 5) is written through an ``OutputWriter``
    under its cleaned product name (column 1), provided that:

    * ``OUTPUT_PRODUCTS`` is empty or contains the row's raw
      ``(Product, Sub-product)`` pair, and
    * the cleaned narrative has between *min_words* and *max_words* words,
      inclusive.

    Args:
        incoming: Path of the complaints CSV file.
        min_words: Smallest number of words a narrative may have.
        max_words: Largest number of words a narrative may have.
    """
    narrative_column = 5
    writers = OutputWriter()
    try:
        with _open_csv(incoming) as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            next(reader, None)  # skip the header row
            for row in reader:
                # Short or blank rows have no narrative to index.
                if len(row) <= narrative_column or not row[narrative_column]:
                    continue
                if OUTPUT_PRODUCTS and (row[1], row[2]) not in OUTPUT_PRODUCTS:
                    continue

                value = clean_value(row[narrative_column])
                # str.split() ignores leading and trailing whitespace, so
                # padding left by the cleaning is not counted as words.
                word_count = len(value.split())
                if word_count < min_words or word_count > max_words:
                    continue

                writers.write(clean_tag(row[1]), value)
    finally:
        # Always release the output files, even if parsing fails midway.
        writers.close()
# main function
def main():
    """Command-line entry point: ``process.py [input csv]``.

    Prints a usage message and exits with status 1 when no input file is
    given. Otherwise it configures debug logging, makes sure the ``output``
    directory exists, and processes the file with ``process_csv``.
    """
    if len(sys.argv) < 2:
        print("Usage: process.py [input csv]")
        # sys.exit is always available; the bare exit() builtin is injected
        # by the site module and is missing under ``python -S``.
        sys.exit(1)
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG)
    incoming = sys.argv[1]
    if not os.path.isdir("output"):
        os.makedirs("output")
    logging.info("Parsing file: %s", incoming)
    process_csv(incoming)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment