holgerbrandl/Univocity.kt

## Univocity.kt
package krangl.experimental

import com.univocity.parsers.common.record.Record
import com.univocity.parsers.tsv.TsvParser
import com.univocity.parsers.tsv.TsvParserSettings
import krangl.*
import java.io.Reader

/**
 * @author Holger Brandl
 */

// https://github.com/uniVocity/csv-parsers-comparison#jdk-6-1

/**
 * Reads tab-separated data from [reader] into a [DataFrame] using the uniVocity TSV parser.
 *
 * Column names come from the headers reported by the parser; when none are reported the
 * columns are named `X1`, `X2`, ... Repeated header names are disambiguated by appending
 * `_2`, `_3`, ... to the second and later occurrences.
 *
 * @param reader source of the TSV content; handed to the parser as-is.
 * @param settings parser configuration (header extraction enabled by default). If its
 *   `nullValue` is unset it gets assigned `MISSING_VALUE`, i.e. the passed instance is mutated.
 * @param colTypes maps column names to the [ColType] to use; the `".default"` key sets the type
 *   for columns that are not listed and falls back to [ColType.Guess].
 * @return a data frame with one column per header, or a data frame without columns when the
 *   parser returned no records.
 */
fun DataFrame.Companion.readTsvUnivox(
    reader: Reader,
    settings: TsvParserSettings = TsvParserSettings().apply { isHeaderExtractionEnabled = true },
    colTypes: Map<String, ColType> = mapOf()
): DataFrame {
    // see https://www.univocity.com/pages/univocity_parsers_tutorial

    // creates a TSV parser
    val parser = TsvParser(settings)

    // NOTE(review): this default is applied after the parser has been constructed (order kept
    // from the original code), so whether it influences this parse depends on when the parser
    // reads its settings -- confirm against the uniVocity sources before relying on it.
    if (settings.nullValue == null) {
        settings.nullValue = MISSING_VALUE
    }

    // parses all rows in one go.
    val allRecords = parser.parseAllRecords(reader)

    // No records (e.g. empty input): there is nothing to build columns from.
    val first = allRecords.firstOrNull() ?: return SimpleDataFrame(emptyList<DataCol>())

    // headers() is a Java array and may be null, so it must be null-checked *before*
    // converting it; otherwise the positional fallback names can never be reached.
    val columnNames = first.metaData.headers()?.toList()
        ?: (1..first.values.size).map { index -> "X${index}" }

    // Make column names unique: the first occurrence keeps its name, later ones get _2, _3, ...
    val uniqueNames = columnNames
        .withIndex()
        .groupBy { it.value }
        .flatMap { (grpName, columns) ->
            columns.mapIndexed { occurrence, indexedValue ->
                indexedValue.index to (grpName + if (occurrence > 0) "_${occurrence + 1}" else "")
            }
        }
        // restore the original column order after grouping
        .sortedBy { it.first }
        .map { it.second }

    // When header extraction is enabled the parser consumes the header row itself, so
    // allRecords holds data rows only and no row must be dropped here.
    val defaultColType = colTypes[".default"] ?: ColType.Guess

    val cols = uniqueNames.mapIndexed { colIndex, colName ->
        val colType = colTypes[colName] ?: defaultColType

        dataColFactoryUnivox(colName, colIndex, colType, allRecords)
    }

    return SimpleDataFrame(cols)
}


/**
 * Builds the column [colName] from the values found at position [colIndex] of every record,
 * converting them according to [colType].
 *
 * For [ColType.Int] a [NumberFormatException] during conversion makes the column fall back to
 * a string column. For [ColType.Guess] the type is derived from a sample of the column's values
 * and the function is invoked again with the guessed type.
 */
internal fun dataColFactoryUnivox(colName: String, colIndex: Int, colType: ColType, records: List<Record>): DataCol {
    // Used by the String branch and as the fallback of the Int branch.
    fun asStringCol(): DataCol = StringCol(colName, records.map { record -> record.getString(colIndex) })

    return when (colType) {
        ColType.Guess -> {
            val sample = peekColUniVox(colIndex, records)
            dataColFactoryUnivox(colName, colIndex, guessColType(sample), records)
        }

        ColType.String -> asStringCol()

        ColType.Boolean -> BooleanCol(colName, records.map { record -> record.getBoolean(colIndex) })

        ColType.Double -> DoubleCol(colName, records.map { record -> record.getDouble(colIndex) })

        // see https://github.com/holgerbrandl/krangl/issues/10
        ColType.Int -> try {
            IntCol(colName, records.map { record -> record.getInt(colIndex) })
        } catch (e: NumberFormatException) {
            asStringCol()
        }
    }
}


/**
 * Returns up to [peekSize] non-null raw values of the column at [colIndex], scanning [records]
 * in order. The scan is lazy, so it stops as soon as enough values have been collected.
 */
internal fun peekColUniVox(colIndex: Int, records: List<Record>, peekSize: Int = 10) =
    records.asSequence()
        .mapNotNull { record -> record.values[colIndex] }
        .take(peekSize)
        .toList()
	package krangl.experimental

	import com.univocity.parsers.common.record.Record
	import com.univocity.parsers.tsv.TsvParser
	import com.univocity.parsers.tsv.TsvParserSettings
	import krangl.*
	import java.io.Reader

	/**
	* @author Holger Brandl
	*/

	// https://github.com/uniVocity/csv-parsers-comparison#jdk-6-1

	/**
	 * Reads tab-separated data from [reader] into a [DataFrame] using the uniVocity TSV parser.
	 *
	 * Column names come from the headers reported by the parser; when none are reported the
	 * columns are named `X1`, `X2`, ... Repeated header names are disambiguated by appending
	 * `_2`, `_3`, ... to the second and later occurrences.
	 *
	 * @param reader source of the TSV content; handed to the parser as-is.
	 * @param settings parser configuration (header extraction enabled by default). If its
	 *   `nullValue` is unset it gets assigned `MISSING_VALUE`, i.e. the passed instance is mutated.
	 * @param colTypes maps column names to the [ColType] to use; the `".default"` key sets the type
	 *   for columns that are not listed and falls back to [ColType.Guess].
	 * @return a data frame with one column per header, or a data frame without columns when the
	 *   parser returned no records.
	 */
	fun DataFrame.Companion.readTsvUnivox(
	    reader: Reader,
	    settings: TsvParserSettings = TsvParserSettings().apply { isHeaderExtractionEnabled = true },
	    colTypes: Map<String, ColType> = mapOf()
	): DataFrame {
	    // see https://www.univocity.com/pages/univocity_parsers_tutorial

	    // creates a TSV parser
	    val parser = TsvParser(settings)

	    // NOTE(review): this default is applied after the parser has been constructed (order kept
	    // from the original code), so whether it influences this parse depends on when the parser
	    // reads its settings -- confirm against the uniVocity sources before relying on it.
	    if (settings.nullValue == null) {
	        settings.nullValue = MISSING_VALUE
	    }

	    // parses all rows in one go.
	    val allRecords = parser.parseAllRecords(reader)

	    // No records (e.g. empty input): there is nothing to build columns from.
	    val first = allRecords.firstOrNull() ?: return SimpleDataFrame(emptyList<DataCol>())

	    // headers() is a Java array and may be null, so it must be null-checked *before*
	    // converting it; otherwise the positional fallback names can never be reached.
	    val columnNames = first.metaData.headers()?.toList()
	        ?: (1..first.values.size).map { index -> "X${index}" }

	    // Make column names unique: the first occurrence keeps its name, later ones get _2, _3, ...
	    val uniqueNames = columnNames
	        .withIndex()
	        .groupBy { it.value }
	        .flatMap { (grpName, columns) ->
	            columns.mapIndexed { occurrence, indexedValue ->
	                indexedValue.index to (grpName + if (occurrence > 0) "_${occurrence + 1}" else "")
	            }
	        }
	        // restore the original column order after grouping
	        .sortedBy { it.first }
	        .map { it.second }

	    // When header extraction is enabled the parser consumes the header row itself, so
	    // allRecords holds data rows only and no row must be dropped here.
	    val defaultColType = colTypes[".default"] ?: ColType.Guess

	    val cols = uniqueNames.mapIndexed { colIndex, colName ->
	        val colType = colTypes[colName] ?: defaultColType

	        dataColFactoryUnivox(colName, colIndex, colType, allRecords)
	    }

	    return SimpleDataFrame(cols)
	}




	/**
	 * Builds the column [colName] from the values found at position [colIndex] of every record,
	 * converting them according to [colType].
	 *
	 * For [ColType.Int] a [NumberFormatException] during conversion makes the column fall back to
	 * a string column. For [ColType.Guess] the type is derived from a sample of the column's values
	 * and the function is invoked again with the guessed type.
	 */
	internal fun dataColFactoryUnivox(colName: String, colIndex: Int, colType: ColType, records: List<Record>): DataCol {
	    // Used by the String branch and as the fallback of the Int branch.
	    fun asStringCol(): DataCol = StringCol(colName, records.map { record -> record.getString(colIndex) })

	    return when (colType) {
	        ColType.Guess -> {
	            val sample = peekColUniVox(colIndex, records)
	            dataColFactoryUnivox(colName, colIndex, guessColType(sample), records)
	        }

	        ColType.String -> asStringCol()

	        ColType.Boolean -> BooleanCol(colName, records.map { record -> record.getBoolean(colIndex) })

	        ColType.Double -> DoubleCol(colName, records.map { record -> record.getDouble(colIndex) })

	        // see https://github.com/holgerbrandl/krangl/issues/10
	        ColType.Int -> try {
	            IntCol(colName, records.map { record -> record.getInt(colIndex) })
	        } catch (e: NumberFormatException) {
	            asStringCol()
	        }
	    }
	}



	/**
	 * Returns up to [peekSize] non-null raw values of the column at [colIndex], scanning [records]
	 * in order. The scan is lazy, so it stops as soon as enough values have been collected.
	 */
	internal fun peekColUniVox(colIndex: Int, records: List<Record>, peekSize: Int = 10) =
	    records.asSequence()
	        .mapNotNull { record -> record.values[colIndex] }
	        .take(peekSize)
	        .toList()