Created
June 15, 2021 03:25
-
-
Save simonw/465f9356f175d1cf86957947dff501d4 to your computer and use it in GitHub Desktop.
Experiment to guess the types of CSV data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv, re | |
def is_float(s):
    """Return True if ``float(s)`` succeeds, False otherwise.

    Args:
        s: The candidate value, normally a string cell read from a CSV file.

    Returns:
        bool: True when ``float(s)`` can convert the value, else False.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # ValueError: a string that is not a float literal.
        # TypeError: a non-string such as None or a list, which
        # csv.DictReader can produce for short or over-long rows.
        return False
    return True
def is_int(s):
    """Return True if ``int(s)`` succeeds, False otherwise.

    Args:
        s: The candidate value, normally a string cell read from a CSV file.

    Returns:
        bool: True when ``int(s)`` can convert the value, else False.
    """
    try:
        int(s)
    except (TypeError, ValueError):
        # ValueError: a string that is not an integer literal (e.g. "3.5").
        # TypeError: a non-string such as None or a list, which
        # csv.DictReader can produce for short or over-long rows.
        return False
    return True
# Matches strings shaped exactly like YYYY-MM-DD (four digits, two digits,
# two digits). Only the shape is checked; month and day ranges are not.
# Written as a raw string so that ``\d`` reaches the regex engine untouched
# instead of being treated as an (invalid) string escape sequence.
date_re = re.compile(r'^\d{4}-\d{2}-\d{2}$')
class Tracker:
    """Track which candidate types are still possible for one CSV column.

    ``couldbe`` maps a type name to a predicate. Each call to ``evaluate``
    removes every type whose predicate rejects the given value, so the
    mapping only ever shrinks.
    """

    def __init__(self):
        # Start with every candidate type; insertion order is
        # integer, float, date.
        self.couldbe = dict(
            integer=is_int,
            float=is_float,
            date=date_re.match,
        )

    def __repr__(self):
        # The repr is simply that of the remaining candidates mapping.
        return repr(self.couldbe)

    def evaluate(self, value):
        """Drop every candidate type whose predicate rejects ``value``.

        Falsy values (such as an empty cell) are ignored, and nothing is
        done once no candidates remain.
        """
        if value and self.couldbe:
            # Collect the failures first: the dict must not change size
            # while it is being iterated.
            rejected = [
                label
                for label, check in self.couldbe.items()
                if not check(value)
            ]
            for label in rejected:
                self.couldbe.pop(label)
# Demo: feed a small in-memory CSV through csv.DictReader and keep one
# Tracker per column, narrowing each column's candidate types cell by cell.
# The second and third strings form a single record: the quoted field opened
# in "Dog,\"3" is only closed in the following string.
lines = ["name,age,weight,dob", "Dog,\"3", "2\",3.5,2001-02-04", "cat,2,1.2,"]
rows = csv.DictReader(lines)
trackers = {}
for row in rows:
    for key, value in row.items():
        # Create a Tracker only the first time a column is seen, rather than
        # building a throwaway instance for every cell.
        tracker = trackers.get(key)
        if tracker is None:
            tracker = trackers[key] = Tracker()
        tracker.evaluate(value)
print(trackers)
# {'name': {},
# 'age': {'integer': <function is_int at 0x10a3e1670>, 'float': <function is_float at 0x10a3e11f0>},
# 'weight': {'float': <function is_float at 0x10a3e11f0>},
# 'dob': {'date': <built-in method match of re.Pattern object at 0x1074e5cf0>}}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment