Skip to content

Instantly share code, notes, and snippets.

@simonw
Created June 15, 2021 03:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save simonw/465f9356f175d1cf86957947dff501d4 to your computer and use it in GitHub Desktop.
Save simonw/465f9356f175d1cf86957947dff501d4 to your computer and use it in GitHub Desktop.
Experiment to guess the types of CSV data
import csv, re
def is_float(s):
    """Return True if *s* can be converted with ``float()``, else False.

    Note that ``float()`` also accepts surrounding whitespace and the
    special spellings "nan", "inf" and "infinity".

    Non-string, non-numeric values (``None``, lists, ...) make ``float()``
    raise ``TypeError`` rather than ``ValueError``; these are reported as
    "not a float" instead of propagating the exception.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True
def is_int(s):
    """Return True if *s* can be converted with ``int()``, else False.

    Note that ``int()`` also accepts surrounding whitespace and digit
    group underscores (e.g. "1_000").

    Values that ``int()`` rejects with ``TypeError`` (``None``, lists, ...)
    are reported as "not an integer" instead of propagating the exception.
    """
    try:
        int(s)
    except (TypeError, ValueError):
        return False
    return True
# Matches strings shaped like YYYY-MM-DD (four digits, two digits, two digits).
# Only the shape is checked; month/day ranges are not validated.
# Raw string avoids the invalid "\d" escape warning; "\Z" (instead of "$")
# refuses a trailing newline, which "$" would silently allow.
date_re = re.compile(r"^\d{4}-\d{2}-\d{2}\Z")
class Tracker:
    """Track which candidate types a column of values could still be.

    ``couldbe`` maps a type name to a predicate. Each call to
    ``evaluate`` removes every candidate whose predicate rejects the
    value, so after feeding all values of a column the remaining keys are
    the types every non-empty value was compatible with.
    """

    def __init__(self):
        # Candidate type name -> predicate returning a truthy value on match.
        self.couldbe = {
            "integer": is_int,
            "float": is_float,
            "date": date_re.match,
        }

    def __repr__(self):
        return repr(self.couldbe)

    def evaluate(self, value):
        """Drop every candidate type that *value* is not compatible with.

        Falsy values (empty string, ``None``) are ignored so blank cells
        do not rule anything out. Nothing is done once no candidates
        remain.
        """
        if not value or not self.couldbe:
            return
        # Collect first, delete afterwards: a dict must not change size
        # while it is being iterated.
        rejected = [
            name
            for name, test in self.couldbe.items()
            if not self._passes(test, value)
        ]
        for name in rejected:
            del self.couldbe[name]

    @staticmethod
    def _passes(test, value):
        """Return whether *test* accepts *value*.

        A predicate that raises ``TypeError`` (e.g. a regex ``match``
        handed a list instead of a string) counts as rejecting the value.
        """
        try:
            return bool(test(value))
        except TypeError:
            return False
# Sample input. The second and third items together form one record: the
# quoted age field opens in one list item and closes in the next.
lines = ["name,age,weight,dob", "Dog,\"3", "2\",3.5,2001-02-04", "cat,2,1.2,"]
rows = csv.DictReader(lines)

# Column name -> Tracker accumulating the types that column could still be.
trackers = {}
for row in rows:
    for key, value in row.items():
        # Create a Tracker only the first time a column is seen, rather
        # than building (and discarding) one for every cell.
        tracker = trackers.get(key)
        if tracker is None:
            tracker = trackers[key] = Tracker()
        tracker.evaluate(value)
print(trackers)
# {'name': {},
# 'age': {'integer': <function is_int at 0x10a3e1670>, 'float': <function is_float at 0x10a3e11f0>},
# 'weight': {'float': <function is_float at 0x10a3e11f0>},
# 'dob': {'date': <built-in method match of re.Pattern object at 0x1074e5cf0>}}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment