Skip to content

Instantly share code, notes, and snippets.

@holgerbrandl
Created September 5, 2018 20:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save holgerbrandl/e85f3b3e73de560be45fefa706d1234c to your computer and use it in GitHub Desktop.
Save holgerbrandl/e85f3b3e73de560be45fefa706d1234c to your computer and use it in GitHub Desktop.
univocity as file reader backend for krangl
package krangl.experimental
import com.univocity.parsers.common.record.Record
import com.univocity.parsers.tsv.TsvParser
import com.univocity.parsers.tsv.TsvParserSettings
import krangl.*
import java.io.Reader
/**
* @author Holger Brandl
*/
// https://github.com/uniVocity/csv-parsers-comparison#jdk-6-1
/**
 * Reads tab-separated data from [reader] into a [DataFrame] using the univocity TSV parser.
 *
 * Column names are taken from the header metadata of the parsed records. When no headers are
 * available, positional names `X1`, `X2`, ... are generated instead. Repeated column names are
 * made unique by appending `_2`, `_3`, ... to the second and later occurrences.
 *
 * @param reader source of the TSV content; it is handed to the parser as is.
 * @param settings parser configuration, by default with header extraction enabled. Note that
 *   `nullValue` is set to [MISSING_VALUE] on this instance if it was not configured.
 * @param colTypes column name to column type. The special key `".default"` provides the type for
 *   columns that are not listed; without it such columns use [ColType.Guess].
 * @return a data frame with one column per (unique) column name. If the parser yields no records
 *   at all, no column names can be derived here and the result has no columns.
 */
fun DataFrame.Companion.readTsvUnivox(
    reader: Reader,
    settings: TsvParserSettings = TsvParserSettings().apply { isHeaderExtractionEnabled = true },
    colTypes: Map<String, ColType> = mapOf()
): DataFrame {
    // from https://www.univocity.com/pages/univocity_parsers_tutorial
    val parser = TsvParser(settings)

    // NOTE(review): the parser was already constructed from `settings` above, so this assignment
    // may not influence the current parse - confirm against univocity before relying on it.
    // The statement order is kept as it was to avoid changing how empty cells are parsed.
    if (settings.nullValue == null) {
        settings.nullValue = MISSING_VALUE
    }

    // parses all rows in one go.
    val allRecords = parser.parseAllRecords(reader)

    // `headers()` is a Java array that may be null (e.g. without header extraction), so it needs
    // a safe call for the positional fallback to be reachable. An empty parse yields no names.
    val columnNames = allRecords.firstOrNull()?.let { first ->
        first.metaData.headers()?.toList()
            ?: (1..first.values.size).map { index -> "X${index}" }
    } ?: emptyList()

    // Make column names unique: the first occurrence keeps its name, the n-th gets suffix "_n".
    val uniqueNames = columnNames
        .withIndex()
        .groupBy { it.value }
        .flatMap { (grpName, columns) ->
            columns.mapIndexed { occurrence, indexedValue ->
                indexedValue.index to (grpName + if (occurrence >= 1) "_${occurrence + 1}" else "")
            }
        }
        .sortedBy { it.first }
        .map { it.second }

    // With header extraction enabled univocity consumes the header row itself and does not return
    // it as a record, so every element of `allRecords` is data and nothing must be dropped here.
    val defaultColType = colTypes[".default"] ?: ColType.Guess

    val cols = uniqueNames.mapIndexed { colIndex, colName ->
        val colType = colTypes[colName] ?: defaultColType
        dataColFactoryUnivox(colName, colIndex, colType, allRecords)
    }

    return SimpleDataFrame(cols)
}
/**
 * Builds the column named [colName] from the values found at position [colIndex] of each record.
 *
 * The column implementation is selected by [colType]:
 * - [ColType.Guess] peeks at the column via [peekColUniVox], lets `guessColType` pick a type
 *   and then builds the column for that type.
 * - [ColType.Int] falls back to a [StringCol] if reading the values raises a
 *   [NumberFormatException] (see https://github.com/holgerbrandl/krangl/issues/10).
 * - All other types read the values with the matching typed accessor of [Record].
 */
internal fun dataColFactoryUnivox(colName: String, colIndex: Int, colType: ColType, records: List<Record>): DataCol {
    return when (colType) {
        ColType.Guess -> {
            val sample = peekColUniVox(colIndex, records)
            dataColFactoryUnivox(colName, colIndex, guessColType(sample), records)
        }

        ColType.String -> StringCol(colName, records.map { row -> row.getString(colIndex) })

        ColType.Boolean -> BooleanCol(colName, records.map { row -> row.getBoolean(colIndex) })

        ColType.Double -> DoubleCol(colName, records.map { row -> row.getDouble(colIndex) })

        ColType.Int -> try {
            val ints = records.map { row -> row.getInt(colIndex) }
            IntCol(colName, ints)
        } catch (e: NumberFormatException) {
            // not all values are integers: keep the raw strings instead
            StringCol(colName, records.map { row -> row.getString(colIndex) })
        }
    }
}
/**
 * Collects up to [peekSize] non-null raw values of the column at [colIndex], scanning [records]
 * from the start and stopping as soon as enough values have been found.
 */
internal fun peekColUniVox(colIndex: Int, records: List<Record>, peekSize: Int = 10) =
    records
        .asSequence()
        .mapNotNull { record -> record.values[colIndex] }
        .take(peekSize)
        .toList()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment