Skip to content

Instantly share code, notes, and snippets.

@jlumbroso
Last active May 19, 2020 15:40
Show Gist options
  • Save jlumbroso/c123a30a2380b58989c7b12fe4b4f49e to your computer and use it in GitHub Desktop.
Save jlumbroso/c123a30a2380b58989c7b12fe4b4f49e to your computer and use it in GitHub Desktop.
Helper code for the `clevercsv` package to help sniff the dialect of very large files.
"""
Helper code for the `clevercsv` package to help sniff the dialect of very large files.
Author: lumbroso@cs.princeton.edu
Date: 2020-05-19
"""
import clevercsv
# Tuning knobs for `probe_sniff`; the three sizes are compared against `len(content)`.
THRESHOLD_WHOLE_FILE = 10_000  # content shorter than this is sniffed whole, in one pass
THRESHOLD_START = 1_000  # length of the first truncated sample
THRESHOLD_END = 100_000  # sampling stops once the sample length would reach this
THRESHOLD_COUNT = 4  # consecutive identical detections required to stop early
def probe_sniff(content, verbose=False):
    """
    Sniff the dialect of `content` from progressively larger truncated samples.

    Content shorter than `THRESHOLD_WHOLE_FILE` is sniffed whole, in one pass.
    Otherwise the sample length starts at `THRESHOLD_START` and doubles after
    every detection, until one of the following holds:

    - the last `THRESHOLD_COUNT` detections are all equal;
    - the next sample length would reach `THRESHOLD_END`;
    - the current sample already covers the whole content.

    Args:
        content: Sliceable data handed to `clevercsv.Sniffer().sniff`
            (presumably the file's text as a `str` -- confirm with callers).
        verbose: Forwarded to `clevercsv.Sniffer().sniff` for every sample.

    Returns:
        Whatever `clevercsv.Sniffer().sniff` returned for the last sample tried.
    """

    def detect(size):
        # Slicing past the end is harmless: it simply yields the whole content.
        return clevercsv.Sniffer().sniff(content[:size], verbose=verbose)

    def trailing_equality(lst, k):
        """Tell whether `lst` holds at least `k` items and its last `k` are equal."""
        if k < 1:
            # `lst[-0:]` would be the whole list; treat a non-positive count
            # as "any single detection is enough".
            return bool(lst)
        if len(lst) < k:
            return False
        last = lst[-1]
        return not any(last != item for item in lst[-k:])

    if len(content) < THRESHOLD_WHOLE_FILE:
        return detect(len(content))

    sample_size = THRESHOLD_START
    # Always sniff at least once so that there is a result to return, even if
    # the thresholds are tuned such that THRESHOLD_START >= THRESHOLD_END.
    dialects = [detect(sample_size)]
    while (
        not trailing_equality(dialects, THRESHOLD_COUNT)
        # Once the sample covers everything, a larger one is the same data.
        and sample_size < len(content)
        and sample_size * 2 < THRESHOLD_END
    ):
        sample_size *= 2
        dialects.append(detect(sample_size))

    return dialects[-1]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment