# jlumbroso/clevercsv_probe_sniff.py

## clevercsv_probe_sniff.py
"""
Helper code for the `clevercsv` package to help sniff the dialect of very large files.

Author: lumbroso@cs.princeton.edu
Date: 2020-05-19
"""

import clevercsv

# Content shorter than this is sniffed in a single pass over all of it.
THRESHOLD_WHOLE_FILE = 10_000
# Size of the first prefix handed to the sniffer; doubled after each attempt.
THRESHOLD_START = 1_000
# Probing stops once the prefix size reaches this bound.
THRESHOLD_END = 100_000
# Number of consecutive identical detections needed to accept a dialect.
THRESHOLD_COUNT = 4

def probe_sniff(content, verbose=False):
    """
    Sniff the dialect of `content` from progressively larger prefixes.

    Content shorter than `THRESHOLD_WHOLE_FILE` is sniffed in a single pass.
    Otherwise the sniffer is run on prefixes that start at `THRESHOLD_START`
    and double in size after every attempt, until one of the following holds:

    - the last `THRESHOLD_COUNT` detections all agree;
    - the prefix already covers all of `content`;
    - the prefix size reaches `THRESHOLD_END`.

    `content` must support `len()` and slicing. `verbose` is forwarded to
    `clevercsv.Sniffer().sniff`. The return value is whatever the last
    sniffing attempt returned.
    """

    def detect(size):
        # Slicing past the end is harmless: it simply yields all of `content`.
        return clevercsv.Sniffer().sniff(content[:size], verbose=verbose)

    def trailing_equality(lst, k):
        # True when `lst` holds at least `k` items and its last `k` all agree.
        if len(lst) < k:
            return False
        tail = lst[len(lst) - k:]
        return not any(tail[-1] != item for item in tail)

    total = len(content)
    if total < THRESHOLD_WHOLE_FILE:
        return detect(total)

    current_size = THRESHOLD_START
    dialects = []

    while current_size < THRESHOLD_END:
        dialects.append(detect(current_size))
        if trailing_equality(dialects, THRESHOLD_COUNT):
            break
        if current_size >= total:
            # The prefix was the entire content; any larger prefix would be
            # the same data, so further attempts cannot change the answer.
            break
        current_size *= 2

    return dialects[-1]
	"""
	Helper code for the `clevercsv` package to help sniff the dialect of very large files.

	Author: lumbroso@cs.princeton.edu
	Date: 2020-05-19
	"""

	import clevercsv

	# Content shorter than this is sniffed in a single pass over all of it.
	THRESHOLD_WHOLE_FILE = 10_000
	# Size of the first prefix handed to the sniffer; doubled after each attempt.
	THRESHOLD_START = 1_000
	# Probing stops once the prefix size reaches this bound.
	THRESHOLD_END = 100_000
	# Number of consecutive identical detections needed to accept a dialect.
	THRESHOLD_COUNT = 4

	def probe_sniff(content, verbose=False):
	    """
	    Sniff the dialect of `content` from progressively larger prefixes.

	    Content shorter than `THRESHOLD_WHOLE_FILE` is sniffed in a single pass.
	    Otherwise the sniffer is run on prefixes that start at `THRESHOLD_START`
	    and double in size after every attempt, until one of the following holds:

	    - the last `THRESHOLD_COUNT` detections all agree;
	    - the prefix already covers all of `content`;
	    - the prefix size reaches `THRESHOLD_END`.

	    `content` must support `len()` and slicing. `verbose` is forwarded to
	    `clevercsv.Sniffer().sniff`. The return value is whatever the last
	    sniffing attempt returned.
	    """

	    def detect(size):
	        # Slicing past the end is harmless: it simply yields all of `content`.
	        return clevercsv.Sniffer().sniff(content[:size], verbose=verbose)

	    def trailing_equality(lst, k):
	        # True when `lst` holds at least `k` items and its last `k` all agree.
	        if len(lst) < k:
	            return False
	        tail = lst[len(lst) - k:]
	        return not any(tail[-1] != item for item in tail)

	    total = len(content)
	    if total < THRESHOLD_WHOLE_FILE:
	        return detect(total)

	    current_size = THRESHOLD_START
	    dialects = []

	    while current_size < THRESHOLD_END:
	        dialects.append(detect(current_size))
	        if trailing_equality(dialects, THRESHOLD_COUNT):
	            break
	        if current_size >= total:
	            # The prefix was the entire content; any larger prefix would be
	            # the same data, so further attempts cannot change the answer.
	            break
	        current_size *= 2

	    return dialects[-1]