Created
September 5, 2018 20:05
-
-
Save holgerbrandl/e85f3b3e73de560be45fefa706d1234c to your computer and use it in GitHub Desktop.
univocity as file reader backend for krangl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package krangl.experimental | |
import com.univocity.parsers.common.record.Record | |
import com.univocity.parsers.tsv.TsvParser | |
import com.univocity.parsers.tsv.TsvParserSettings | |
import krangl.* | |
import java.io.Reader | |
/** | |
* @author Holger Brandl | |
*/ | |
// https://github.com/uniVocity/csv-parsers-comparison#jdk-6-1 | |
/**
 * Reads tab-separated data from [reader] into a [DataFrame] using the univocity TSV parser.
 *
 * All records are parsed eagerly in one go. Column names are taken from the header metadata reported
 * by the parser; if the parser reports no headers, columns are named `X1`, `X2`, ... Repeated
 * occurrences of the same column name are suffixed with `_2`, `_3`, ... in order of appearance.
 *
 * @param reader source of the tab-separated text.
 * @param settings parser configuration; by default the first row is extracted as header. If its
 * `nullValue` is unset, it is set to [MISSING_VALUE] on the passed object.
 * @param colTypes maps column names to the type they should be read as. The key `".default"` sets
 * the type for columns not listed explicitly; without it [ColType.Guess] is used.
 * @return a data frame with one column per column name, or a data frame without columns if the
 * parser yields no records.
 */
fun DataFrame.Companion.readTsvUnivox(
    reader: Reader,
    settings: TsvParserSettings = TsvParserSettings().apply { isHeaderExtractionEnabled = true },
    colTypes: Map<String, ColType> = mapOf()
): DataFrame {
    // creates a TSV parser, see https://www.univocity.com/pages/univocity_parsers_tutorial
    val parser = TsvParser(settings)

    // NOTE(review): this assignment happens after the parser was constructed from `settings`; whether
    // the parser still picks it up depends on when univocity reads the setting -- confirm. It is kept
    // in its original position so that the behavior of existing callers does not change.
    if (settings.nullValue == null) {
        settings.nullValue = MISSING_VALUE
    }

    // parses all rows in one go
    val allRecords = parser.parseAllRecords(reader)

    // An empty result (e.g. empty or header-only input) must not crash; it yields a frame without columns.
    val firstRecord = allRecords.firstOrNull()

    val columnNames: List<String> = when (firstRecord) {
        null -> emptyList()
        // headers() may be null when no headers were extracted or configured, hence the safe call
        else -> firstRecord.metaData.headers()?.toList()
            ?: (1..firstRecord.values.size).map { index -> "X${index}" }
    }

    // Make column names unique: the first occurrence keeps its name, later ones get `_2`, `_3`, ...
    val uniqueNames = columnNames
        .withIndex()
        .groupBy { it.value }
        .flatMap { (grpName, columns) ->
            columns.mapIndexed { occurrence, indexedName ->
                indexedName.index to (if (occurrence == 0) grpName else "${grpName}_${occurrence + 1}")
            }
        }
        .sortedBy { it.first }
        .map { it.second }

    // With header extraction enabled univocity consumes the header row itself and does not return it
    // as a record, so all parsed records are data rows and none must be dropped here.
    val defaultColType = colTypes[".default"] ?: ColType.Guess

    val cols = uniqueNames.mapIndexed { colIndex, colName ->
        val colType = colTypes[colName] ?: defaultColType
        dataColFactoryUnivox(colName, colIndex, colType, allRecords)
    }

    return SimpleDataFrame(cols)
}
/**
 * Builds the column [colName] from the field at position [colIndex] of every record in [records].
 *
 * The requested [colType] selects the column implementation. For [ColType.Int] a
 * [NumberFormatException] during conversion makes the column fall back to a string column.
 * For [ColType.Guess] the type is determined by `guessColType` from the values returned by
 * [peekColUniVox], and the column is then built for that type.
 */
internal fun dataColFactoryUnivox(colName: String, colIndex: Int, colType: ColType, records: List<Record>): DataCol {
    // shared by the explicit string branch and the integer fallback
    fun stringColumn() = StringCol(colName, records.map { record -> record.getString(colIndex) })

    return when (colType) {
        // see https://github.com/holgerbrandl/krangl/issues/10
        ColType.Int ->
            try {
                IntCol(colName, records.map { record -> record.getInt(colIndex) })
            } catch (e: NumberFormatException) {
                stringColumn()
            }

        ColType.Double -> DoubleCol(colName, records.map { record -> record.getDouble(colIndex) })

        ColType.Boolean -> BooleanCol(colName, records.map { record -> record.getBoolean(colIndex) })

        ColType.String -> stringColumn()

        ColType.Guess -> {
            val sampledValues = peekColUniVox(colIndex, records)
            val guessedType = guessColType(sampledValues)
            dataColFactoryUnivox(colName, colIndex, guessedType, records)
        }
    }
}
/**
 * Collects up to [peekSize] non-null raw values of the field at position [colIndex],
 * scanning [records] lazily in order.
 */
internal fun peekColUniVox(colIndex: Int, records: List<Record>, peekSize: Int = 10) =
    records
        .asSequence()
        .mapNotNull { record -> record.values[colIndex] }
        .take(peekSize)
        .toList()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment