Last active
May 19, 2020 15:40
-
-
Save jlumbroso/c123a30a2380b58989c7b12fe4b4f49e to your computer and use it in GitHub Desktop.
Helper code for the `clevercsv` package to help sniff the dialect of very large files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Helper code for the `clevercsv` package to help sniff the dialect of very large files.
Author: lumbroso@cs.princeton.edu
Date: 2020-05-19
"""
import clevercsv
# Content shorter than this is sniffed whole, in a single pass.
THRESHOLD_WHOLE_FILE = 10000
# Size of the first prefix handed to the sniffer; the size doubles each round.
THRESHOLD_START = 1000
# Probing stops once the prefix size is no longer strictly below this bound.
THRESHOLD_END = 100000
# Number of consecutive identical detections required to accept a dialect.
THRESHOLD_COUNT = 4
def probe_sniff(content, verbose=False):
    """
    Run the sniffing method on progressively larger truncated versions of the
    target content, until it is likely enough that the dialect has been
    correctly detected.

    Content shorter than `THRESHOLD_WHOLE_FILE` is sniffed in one pass.
    Otherwise the prefix size starts at `THRESHOLD_START` and doubles each
    round; probing stops as soon as one of the following holds:

    - the last `THRESHOLD_COUNT` consecutive detections are all equal;
    - the prefix size is no longer below `THRESHOLD_END`;
    - the prefix already covers the whole content (larger prefixes would
      only re-sniff the very same text).

    :param content: The text to sniff; must support `len()` and slicing.
    :param verbose: Forwarded to `clevercsv.Sniffer.sniff`.
    :return: The value returned by the last sniff that was performed.
    """

    def detect(size):
        # Sniff only the first `size` elements of the content.
        return clevercsv.Sniffer().sniff(content[:size], verbose=verbose)

    def trailing_equality(lst, k):
        # True when `lst` holds at least `k` items and its last `k` are equal.
        if k <= 0 or len(lst) < k:
            return False
        tail = lst[-k:]
        return not any(item != tail[-1] for item in tail)

    total_size = len(content)
    if total_size < THRESHOLD_WHOLE_FILE:
        return detect(total_size)

    current_size = THRESHOLD_START
    dialects = []
    while (current_size < THRESHOLD_END
           and not trailing_equality(dialects, THRESHOLD_COUNT)):
        dialects.append(detect(current_size))
        if current_size >= total_size:
            # The whole content has just been sniffed; growing the prefix
            # further cannot change the outcome.
            break
        current_size *= 2

    return dialects[-1]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment