asm0dey/CsvSniffer.kt

## CsvSniffer.kt
/**
 * BSD 3-Clause License
 *
 * Copyright (c) [2020], [Pasha Finkelshteyn <asm0dey@jetbrains.com>]
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

import java.io.File
import kotlin.math.min
import kotlin.text.RegexOption.DOT_MATCHES_ALL
import kotlin.text.RegexOption.MULTILINE

/**
 * © https://github.com/python/cpython/blob/ee40e4b8563e6e1bc2bfb267da5ffc9a2293318d/Lib/csv.py
 * Ported to Kotlin from csv.Python by asm0dey@jetbrains.com
 *
 * This object tries do identify delimiters and quotes in CSV file. Also it finds if there are spaces which should be
 * stripped and quotes which should be escaped.
 *
 * By standard any ascii character may be delimiter and only " and ' can be quotes
 */
object CsvSniffer {
    private val preferred = charArrayOf(',', '\t', ';', ' ', ':')
    private val rexes = sequenceOf(
        "(?<delim>[^\\w\n\"'])(?<space> ?)(?<quote>[\"']).*?\\k<quote>\\k<delim>",
        "(?:^|\n)(?<quote>[\"']).*?\\k<quote>(?<delim>[^\\w\n\"'])(?<space> ?)",
        "(?<delim>[^\\w\n\"'])(?<space> ?)(?<quote>[\"']).*?\\k<quote>(?:$|\n)",
        "(?:^|\n)(?<quote>[\"']).*?\\k<quote>(?:$|\n)"
    )
        .map { it.toRegex(setOf(DOT_MATCHES_ALL, MULTILINE)) }

    fun sniff(file: File): SearchResult? =
        guessQuoteAndDelimiter(file)?.takeIf { it.delim.isNotEmpty() } ?: guessDlimiter(file)

    /**
     *
     * Looks for text enclosed between two identical quotes
     * (the probable quotechar) which are preceded and followed
     * by the same character (the probable delimiter).
     * For example:
     * ,'some text',
     * The quote with the most wins, same with the delimiter.
     * If there is no quotechar the delimiter can't be determined
     * this way.
     */
    fun guessQuoteAndDelimiter(data: File): SearchResult? {
        val text = data.readText()
        val matches = rexes
            .map { it.findAll(text) }
            .firstOrNull { !it.none() }
            ?: return null

        val quotes = hashMapOf<String, Int>()
        val delims = hashMapOf<String, Int>()
        var spaces = 0
        for (m in matches) {
            var key = m.groups["quote"]!!.value
            if (key.isNotEmpty()) {
                val newVal = quotes.getOrDefault(key, 0) + 1
                quotes[key] = newVal
            }
            key = m.groups["delim"]?.value ?: continue
            if (key.isNotEmpty()) {
                val newVal = delims.getOrDefault(key, 0) + 1
                delims[key] = newVal
            }

            if (m.groups["space"]?.value?.isNotEmpty() == true) spaces++
        }
        val quote = quotes.toList().maxBy { it.second }?.first ?: ""
        var delim = delims.toList().maxBy { it.second }?.first
        val skipInitialSpace: Boolean
        if (delim == null) {
            // there is *no* delimiter, it's a single column of quoted data
            delim = ""
            skipInitialSpace = false
        } else {
            skipInitialSpace = delims[delim] == spaces
            if (delim == "\n") delim = ""

        }
        /*
        * if we see an extra quote between delimiters, we've got a
        * double quoted format
        */
        val escDelim = Regex.escape(delim)
        val escQuote = Regex.escape(quote)
        val doubleQuote =
            "(($escDelim)|^)\\W*$escQuote[^$escDelim\n]*$escQuote[^$escDelim\n]*$escQuote\\W*(($escDelim)|$)"
                .toRegex(MULTILINE)
                .find(text) != null
        return SearchResult(delim, skipInitialSpace, quote[0], doubleQuote)

    }

    /**
     *
     * The delimiter /should/ occur the same number of times on
     * each row. However, due to malformed data, it may not. We don't want
     * an all or nothing approach, so we allow for small variations in this
     * number.
     * 1) build a table of the frequency of each character on every line.
     * 2) build a table of frequencies of this frequency (meta-frequency?),
     * e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
     * 7 times in 2 rows'
     * 3) use the mode of the meta-frequency to determine the /expected/
     * frequency for that character
     * 4) find out how often the character actually meets that goal
     * 5) the character that best meets its goal is the delimiter
     * For performance reasons, the data is evaluated in chunks, so it can
     * try and evaluate the smallest portion of the data possible, evaluating
     * additional chunks as necessary.
     */
    fun guessDlimiter(file: File): SearchResult {
        val data = file.readLines().filter { it.isNotBlank() }
        val ascii = (0.toChar()..127.toChar())
        val chunkLength = min(10, data.size)
        var iteration = 0
        val charFrequency = hashMapOf<Char, HashMap<Int, Int>>()
        val delims = hashMapOf<Char, Pair<Int, Int>>()
        val modes = hashMapOf<Char, Pair<Int, Int>>()
        var start = 0
        var end = chunkLength
        while (start < data.size) {
            iteration++
            for (line in data.slice(start until min(start + chunkLength, data.size))) {
                for (char in ascii) {
                    val metaFrequency = charFrequency.getOrDefault(char, hashMapOf())
                    val freq = line.count { it == char }
                    metaFrequency[freq] = metaFrequency.getOrDefault(freq, 0) + 1
                    charFrequency[char] = metaFrequency
                }
            }
            // get the mode of the frequencies
            for (char in charFrequency.keys) {
                val items = charFrequency[char]!!.toList().toMutableList()
                if (items.size == 1 && items.first().first == 0)
                    continue
                if (items.size > 1) {
                    modes[char] = items.maxBy { it.second }!!
                    // adjust the mode - subtract the sum of all other frequencies
                    items.remove(modes[char])
                    modes[char] = modes[char]!!.first to (modes[char]!!.second - items.sumBy { it.second })
                } else {
                    modes[char] = items[0]
                }
            }
            val total = min(chunkLength * iteration, data.size).toDouble()
            var consistency = 1.0
            val threshold = 0.9
            while (delims.size == 0 && consistency >= threshold) {
                for ((k, v) in modes.entries) {
                    if (v.first > 0 && v.second > 0) {
                        if (v.second / total >= consistency) {
                            delims[k] = v
                        }
                    }
                }
                consistency -= 0.01
            }

            if (delims.size == 1) {
                val delim = delims.keys.first()
                val skipinitialspace =
                    data[0].count { it == delim } == data[0].windowedSequence(2).count { it == "$delim " }
                return SearchResult(delim.toString(), skipinitialspace)
            }
            // analyze another chunkLength lines
            start = end
            end += chunkLength
        }
        if (delims.isEmpty()) return SearchResult("", false)
        // if there's more than one, fall back to a 'preferred' list
        if (delims.size > 1) {
            for (delim in preferred) {
                if (delims.keys.contains(delim)) {
                    val skipinitialspace =
                        data[0].count { it == delim } == data[0].windowedSequence(2).count { it == "$delim " }
                    return SearchResult(delim.toString(), skipinitialspace)
                }
            }
        }
        // nothing else indicates a preference, pick the character that dominates(?)
        val delim = delims.entries.maxBy { it.value.first }!!.key.toString()
        val skipinitialspace =
            data[0].count { it == delim[0] } == data[0].windowedSequence(2).count { it == "$delim " }
        return SearchResult(delim, skipinitialspace)
    }
}

data class SearchResult(
    val delim: String,
    val skipInitialSpace: Boolean,
    val quoteChar: Char? = null,
    val doubleQuote: Boolean? = null
)
	/**
	* BSD 3-Clause License
	*
	* Copyright (c) [2020], [Pasha Finkelshteyn <asm0dey@jetbrains.com>]
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	*
	* 1. Redistributions of source code must retain the above copyright notice, this
	* list of conditions and the following disclaimer.
	*
	* 2. Redistributions in binary form must reproduce the above copyright notice,
	* this list of conditions and the following disclaimer in the documentation
	* and/or other materials provided with the distribution.
	*
	* 3. Neither the name of the copyright holder nor the names of its
	* contributors may be used to endorse or promote products derived from
	* this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
	* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
	* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	*/

	import java.io.File
	import kotlin.math.min
	import kotlin.text.RegexOption.DOT_MATCHES_ALL
	import kotlin.text.RegexOption.MULTILINE

	/**
	* © https://github.com/python/cpython/blob/ee40e4b8563e6e1bc2bfb267da5ffc9a2293318d/Lib/csv.py
	* Ported to Kotlin from csv.Python by asm0dey@jetbrains.com
	*
	* This object tries do identify delimiters and quotes in CSV file. Also it finds if there are spaces which should be
	* stripped and quotes which should be escaped.
	*
	* By standard any ascii character may be delimiter and only " and ' can be quotes
	*/
	object CsvSniffer {
	private val preferred = charArrayOf(',', '\t', ';', ' ', ':')
	private val rexes = sequenceOf(
	"(?<delim>[^\\w\n\"'])(?<space> ?)(?<quote>[\"']).*?\\k<quote>\\k<delim>",
	"(?:^\|\n)(?<quote>[\"']).*?\\k<quote>(?<delim>[^\\w\n\"'])(?<space> ?)",
	"(?<delim>[^\\w\n\"'])(?<space> ?)(?<quote>[\"']).*?\\k<quote>(?:$\|\n)",
	"(?:^\|\n)(?<quote>[\"']).*?\\k<quote>(?:$\|\n)"
	)
	.map { it.toRegex(setOf(DOT_MATCHES_ALL, MULTILINE)) }

	fun sniff(file: File): SearchResult? =
	guessQuoteAndDelimiter(file)?.takeIf { it.delim.isNotEmpty() } ?: guessDlimiter(file)

	/**
	*
	* Looks for text enclosed between two identical quotes
	* (the probable quotechar) which are preceded and followed
	* by the same character (the probable delimiter).
	* For example:
	* ,'some text',
	* The quote with the most wins, same with the delimiter.
	* If there is no quotechar the delimiter can't be determined
	* this way.
	*/
	fun guessQuoteAndDelimiter(data: File): SearchResult? {
	val text = data.readText()
	val matches = rexes
	.map { it.findAll(text) }
	.firstOrNull { !it.none() }
	?: return null

	val quotes = hashMapOf<String, Int>()
	val delims = hashMapOf<String, Int>()
	var spaces = 0
	for (m in matches) {
	var key = m.groups["quote"]!!.value
	if (key.isNotEmpty()) {
	val newVal = quotes.getOrDefault(key, 0) + 1
	quotes[key] = newVal
	}
	key = m.groups["delim"]?.value ?: continue
	if (key.isNotEmpty()) {
	val newVal = delims.getOrDefault(key, 0) + 1
	delims[key] = newVal
	}

	if (m.groups["space"]?.value?.isNotEmpty() == true) spaces++
	}
	val quote = quotes.toList().maxBy { it.second }?.first ?: ""
	var delim = delims.toList().maxBy { it.second }?.first
	val skipInitialSpace: Boolean
	if (delim == null) {
	// there is no delimiter, it's a single column of quoted data
	delim = ""
	skipInitialSpace = false
	} else {
	skipInitialSpace = delims[delim] == spaces
	if (delim == "\n") delim = ""

	}
	/*
	* if we see an extra quote between delimiters, we've got a
	* double quoted format
	*/
	val escDelim = Regex.escape(delim)
	val escQuote = Regex.escape(quote)
	val doubleQuote =
	"(($escDelim)\|^)\\W$escQuote[^$escDelim\n]$escQuote[^$escDelim\n]$escQuote\\W(($escDelim)\|$)"
	.toRegex(MULTILINE)
	.find(text) != null
	return SearchResult(delim, skipInitialSpace, quote[0], doubleQuote)

	}

	/**
	*
	* The delimiter /should/ occur the same number of times on
	* each row. However, due to malformed data, it may not. We don't want
	* an all or nothing approach, so we allow for small variations in this
	* number.
	* 1) build a table of the frequency of each character on every line.
	* 2) build a table of frequencies of this frequency (meta-frequency?),
	* e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
	* 7 times in 2 rows'
	* 3) use the mode of the meta-frequency to determine the /expected/
	* frequency for that character
	* 4) find out how often the character actually meets that goal
	* 5) the character that best meets its goal is the delimiter
	* For performance reasons, the data is evaluated in chunks, so it can
	* try and evaluate the smallest portion of the data possible, evaluating
	* additional chunks as necessary.
	*/
	fun guessDlimiter(file: File): SearchResult {
	val data = file.readLines().filter { it.isNotBlank() }
	val ascii = (0.toChar()..127.toChar())
	val chunkLength = min(10, data.size)
	var iteration = 0
	val charFrequency = hashMapOf<Char, HashMap<Int, Int>>()
	val delims = hashMapOf<Char, Pair<Int, Int>>()
	val modes = hashMapOf<Char, Pair<Int, Int>>()
	var start = 0
	var end = chunkLength
	while (start < data.size) {
	iteration++
	for (line in data.slice(start until min(start + chunkLength, data.size))) {
	for (char in ascii) {
	val metaFrequency = charFrequency.getOrDefault(char, hashMapOf())
	val freq = line.count { it == char }
	metaFrequency[freq] = metaFrequency.getOrDefault(freq, 0) + 1
	charFrequency[char] = metaFrequency
	}
	}
	// get the mode of the frequencies
	for (char in charFrequency.keys) {
	val items = charFrequency[char]!!.toList().toMutableList()
	if (items.size == 1 && items.first().first == 0)
	continue
	if (items.size > 1) {
	modes[char] = items.maxBy { it.second }!!
	// adjust the mode - subtract the sum of all other frequencies
	items.remove(modes[char])
	modes[char] = modes[char]!!.first to (modes[char]!!.second - items.sumBy { it.second })
	} else {
	modes[char] = items[0]
	}
	}
	val total = min(chunkLength * iteration, data.size).toDouble()
	var consistency = 1.0
	val threshold = 0.9
	while (delims.size == 0 && consistency >= threshold) {
	for ((k, v) in modes.entries) {
	if (v.first > 0 && v.second > 0) {
	if (v.second / total >= consistency) {
	delims[k] = v
	}
	}
	}
	consistency -= 0.01
	}

	if (delims.size == 1) {
	val delim = delims.keys.first()
	val skipinitialspace =
	data[0].count { it == delim } == data[0].windowedSequence(2).count { it == "$delim " }
	return SearchResult(delim.toString(), skipinitialspace)
	}
	// analyze another chunkLength lines
	start = end
	end += chunkLength
	}
	if (delims.isEmpty()) return SearchResult("", false)
	// if there's more than one, fall back to a 'preferred' list
	if (delims.size > 1) {
	for (delim in preferred) {
	if (delims.keys.contains(delim)) {
	val skipinitialspace =
	data[0].count { it == delim } == data[0].windowedSequence(2).count { it == "$delim " }
	return SearchResult(delim.toString(), skipinitialspace)
	}
	}
	}
	// nothing else indicates a preference, pick the character that dominates(?)
	val delim = delims.entries.maxBy { it.value.first }!!.key.toString()
	val skipinitialspace =
	data[0].count { it == delim[0] } == data[0].windowedSequence(2).count { it == "$delim " }
	return SearchResult(delim, skipinitialspace)
	}
	}

	data class SearchResult(
	val delim: String,
	val skipInitialSpace: Boolean,
	val quoteChar: Char? = null,
	val doubleQuote: Boolean? = null
	)