Skip to content

Instantly share code, notes, and snippets.

@luketn
Created November 20, 2021 06:08
Show Gist options
  • Save luketn/d29dd425992efff19874d4fa86260343 to your computer and use it in GitHub Desktop.
Save luketn/d29dd425992efff19874d4fa86260343 to your computer and use it in GitHub Desktop.
Convert CSV to Parquet
package com.thing;
import com.google.common.io.Files;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import lombok.extern.slf4j.Slf4j;
import java.awt.*;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
/**
 * Demo program that round-trips a CSV file through Spark: it loads the CSV into a
 * DataFrame, writes it out as Parquet, reads the Parquet back, and finally writes
 * the same data as JSON. Both outputs land in a freshly created temporary directory.
 */
@Slf4j
public class CsvToParquet {

    /** CSV file that is converted when no path is supplied on the command line. */
    private static final String DEFAULT_CSV_PATH = "data/weather.csv";

    /**
     * Runs the CSV -> Parquet -> JSON conversion.
     *
     * @param args optional; {@code args[0]} is the path of the CSV file to convert.
     *             When absent, {@value #DEFAULT_CSV_PATH} is used.
     * @throws IOException if the temporary output directory cannot be created or the
     *                     output directory cannot be opened in the desktop file browser
     */
    public static void main(String[] args) throws IOException {
        String csvPath = args.length > 0 ? args[0] : DEFAULT_CSV_PATH;

        // Fully qualified on purpose: the simple name `Files` is already bound to Guava's
        // com.google.common.io.Files, whose createTempDir() is deprecated.
        Path outputDir = java.nio.file.Files.createTempDirectory("csv-to-parquet").toAbsolutePath();

        // Build the Spark Session
        SparkSession spark = SparkSession.builder()
                .appName("CSV to Parquet")
                .master("local")
                .getOrCreate();
        try {
            convert(spark, csvPath, outputDir);
        } finally {
            // Always release the local Spark context, even when a read or write fails.
            spark.stop();
        }

        openInFileBrowser(outputDir);
    }

    /** Reads the CSV, writes it as Parquet, reads the Parquet back and writes it as JSON. */
    private static void convert(SparkSession spark, String csvPath, Path outputDir) {
        // Read the CSV file into Data Frame.
        // NOTE(review): "inferSchema" is not set, so per the Spark CSV data source docs
        // every column is loaded as a string -- confirm that is what is wanted.
        Dataset<Row> df = spark.read()
                .format("csv")
                .option("header", "true")
                .load(csvPath);
        df.show(5);
        log.info("The dataframe has {} rows.", df.count());

        // Write as Parquet
        Path parquetPath = outputDir.resolve("weather.parquet");
        df.write().parquet(parquetPath.toString());
        log.info("Written Parquet File to \n{}", parquetPath);

        // Reads a Parquet back into Data Frame
        Dataset<Row> pdf = spark.read()
                .format("parquet")
                .load(parquetPath.toString());
        pdf.show(10);
        pdf.printSchema();
        log.info("The Parquet dataframe has {} rows", pdf.count());

        // Now save a JSON
        Path jsonPath = outputDir.resolve("weather.json");
        pdf.write().json(jsonPath.toString());
        log.info("Written file as JSON to\n{}", jsonPath);
    }

    /** Opens {@code dir} in the platform file browser, or logs its location when that is unsupported. */
    private static void openInFileBrowser(Path dir) throws IOException {
        // isDesktopSupported() is false in headless environments (CI, servers), where
        // Desktop.getDesktop() would otherwise throw after all the work has succeeded.
        if (Desktop.isDesktopSupported() && Desktop.getDesktop().isSupported(Desktop.Action.OPEN)) {
            Desktop.getDesktop().open(dir.toFile());
        } else {
            log.info("Desktop integration is unavailable; output was written to {}", dir);
        }
    }
}
outlook temperature humidity windy play
sunny 85 85 FALSE no
sunny 80 90 TRUE no
overcast 83 86 FALSE yes
rainy 70 96 FALSE yes
rainy 68 80 FALSE yes
rainy 65 70 TRUE no
overcast 64 65 TRUE yes
sunny 72 95 FALSE no
sunny 69 70 FALSE yes
rainy 75 80 FALSE yes
sunny 75 70 TRUE yes
overcast 72 90 TRUE yes
overcast 81 75 FALSE yes
rainy 71 91 TRUE no
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment