Skip to content

Instantly share code, notes, and snippets.

@asm0dey
Last active April 24, 2020 10:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save asm0dey/de74f7dc6b99ff6e4fe7ed2eb0ca38fc to your computer and use it in GitHub Desktop.
Save asm0dey/de74f7dc6b99ff6e4fe7ed2eb0ca38fc to your computer and use it in GitHub Desktop.
csv.Sniffer from Python, ported to Kotlin
/**
* BSD 3-Clause License
*
* Copyright (c) [2020], [Pasha Finkelshteyn <asm0dey@jetbrains.com>]
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
import java.io.File
import kotlin.math.min
import kotlin.text.RegexOption.DOT_MATCHES_ALL
import kotlin.text.RegexOption.MULTILINE
/**
* © https://github.com/python/cpython/blob/ee40e4b8563e6e1bc2bfb267da5ffc9a2293318d/Lib/csv.py
* Ported to Kotlin from csv.Python by asm0dey@jetbrains.com
*
* This object tries do identify delimiters and quotes in CSV file. Also it finds if there are spaces which should be
* stripped and quotes which should be escaped.
*
* By standard any ascii character may be delimiter and only " and ' can be quotes
*/
object CsvSniffer {
private val preferred = charArrayOf(',', '\t', ';', ' ', ':')
private val rexes = sequenceOf(
"(?<delim>[^\\w\n\"'])(?<space> ?)(?<quote>[\"']).*?\\k<quote>\\k<delim>",
"(?:^|\n)(?<quote>[\"']).*?\\k<quote>(?<delim>[^\\w\n\"'])(?<space> ?)",
"(?<delim>[^\\w\n\"'])(?<space> ?)(?<quote>[\"']).*?\\k<quote>(?:$|\n)",
"(?:^|\n)(?<quote>[\"']).*?\\k<quote>(?:$|\n)"
)
.map { it.toRegex(setOf(DOT_MATCHES_ALL, MULTILINE)) }
fun sniff(file: File): SearchResult? =
guessQuoteAndDelimiter(file)?.takeIf { it.delim.isNotEmpty() } ?: guessDlimiter(file)
/**
*
* Looks for text enclosed between two identical quotes
* (the probable quotechar) which are preceded and followed
* by the same character (the probable delimiter).
* For example:
* ,'some text',
* The quote with the most wins, same with the delimiter.
* If there is no quotechar the delimiter can't be determined
* this way.
*/
fun guessQuoteAndDelimiter(data: File): SearchResult? {
val text = data.readText()
val matches = rexes
.map { it.findAll(text) }
.firstOrNull { !it.none() }
?: return null
val quotes = hashMapOf<String, Int>()
val delims = hashMapOf<String, Int>()
var spaces = 0
for (m in matches) {
var key = m.groups["quote"]!!.value
if (key.isNotEmpty()) {
val newVal = quotes.getOrDefault(key, 0) + 1
quotes[key] = newVal
}
key = m.groups["delim"]?.value ?: continue
if (key.isNotEmpty()) {
val newVal = delims.getOrDefault(key, 0) + 1
delims[key] = newVal
}
if (m.groups["space"]?.value?.isNotEmpty() == true) spaces++
}
val quote = quotes.toList().maxBy { it.second }?.first ?: ""
var delim = delims.toList().maxBy { it.second }?.first
val skipInitialSpace: Boolean
if (delim == null) {
// there is *no* delimiter, it's a single column of quoted data
delim = ""
skipInitialSpace = false
} else {
skipInitialSpace = delims[delim] == spaces
if (delim == "\n") delim = ""
}
/*
* if we see an extra quote between delimiters, we've got a
* double quoted format
*/
val escDelim = Regex.escape(delim)
val escQuote = Regex.escape(quote)
val doubleQuote =
"(($escDelim)|^)\\W*$escQuote[^$escDelim\n]*$escQuote[^$escDelim\n]*$escQuote\\W*(($escDelim)|$)"
.toRegex(MULTILINE)
.find(text) != null
return SearchResult(delim, skipInitialSpace, quote[0], doubleQuote)
}
/**
*
* The delimiter /should/ occur the same number of times on
* each row. However, due to malformed data, it may not. We don't want
* an all or nothing approach, so we allow for small variations in this
* number.
* 1) build a table of the frequency of each character on every line.
* 2) build a table of frequencies of this frequency (meta-frequency?),
* e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
* 7 times in 2 rows'
* 3) use the mode of the meta-frequency to determine the /expected/
* frequency for that character
* 4) find out how often the character actually meets that goal
* 5) the character that best meets its goal is the delimiter
* For performance reasons, the data is evaluated in chunks, so it can
* try and evaluate the smallest portion of the data possible, evaluating
* additional chunks as necessary.
*/
fun guessDlimiter(file: File): SearchResult {
val data = file.readLines().filter { it.isNotBlank() }
val ascii = (0.toChar()..127.toChar())
val chunkLength = min(10, data.size)
var iteration = 0
val charFrequency = hashMapOf<Char, HashMap<Int, Int>>()
val delims = hashMapOf<Char, Pair<Int, Int>>()
val modes = hashMapOf<Char, Pair<Int, Int>>()
var start = 0
var end = chunkLength
while (start < data.size) {
iteration++
for (line in data.slice(start until min(start + chunkLength, data.size))) {
for (char in ascii) {
val metaFrequency = charFrequency.getOrDefault(char, hashMapOf())
val freq = line.count { it == char }
metaFrequency[freq] = metaFrequency.getOrDefault(freq, 0) + 1
charFrequency[char] = metaFrequency
}
}
// get the mode of the frequencies
for (char in charFrequency.keys) {
val items = charFrequency[char]!!.toList().toMutableList()
if (items.size == 1 && items.first().first == 0)
continue
if (items.size > 1) {
modes[char] = items.maxBy { it.second }!!
// adjust the mode - subtract the sum of all other frequencies
items.remove(modes[char])
modes[char] = modes[char]!!.first to (modes[char]!!.second - items.sumBy { it.second })
} else {
modes[char] = items[0]
}
}
val total = min(chunkLength * iteration, data.size).toDouble()
var consistency = 1.0
val threshold = 0.9
while (delims.size == 0 && consistency >= threshold) {
for ((k, v) in modes.entries) {
if (v.first > 0 && v.second > 0) {
if (v.second / total >= consistency) {
delims[k] = v
}
}
}
consistency -= 0.01
}
if (delims.size == 1) {
val delim = delims.keys.first()
val skipinitialspace =
data[0].count { it == delim } == data[0].windowedSequence(2).count { it == "$delim " }
return SearchResult(delim.toString(), skipinitialspace)
}
// analyze another chunkLength lines
start = end
end += chunkLength
}
if (delims.isEmpty()) return SearchResult("", false)
// if there's more than one, fall back to a 'preferred' list
if (delims.size > 1) {
for (delim in preferred) {
if (delims.keys.contains(delim)) {
val skipinitialspace =
data[0].count { it == delim } == data[0].windowedSequence(2).count { it == "$delim " }
return SearchResult(delim.toString(), skipinitialspace)
}
}
}
// nothing else indicates a preference, pick the character that dominates(?)
val delim = delims.entries.maxBy { it.value.first }!!.key.toString()
val skipinitialspace =
data[0].count { it == delim[0] } == data[0].windowedSequence(2).count { it == "$delim " }
return SearchResult(delim, skipinitialspace)
}
}
data class SearchResult(
val delim: String,
val skipInitialSpace: Boolean,
val quoteChar: Char? = null,
val doubleQuote: Boolean? = null
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment