#code #apache-spark #positional-txt

🥑 Positional File

By Anthony Vilarim Caliani

# # # #

This is an example of writing a single positional file.
In this example I'm using the Avocado Prices dataset, so thanks to Justin Kiggins for sharing it.

The important thing here is the code, but if you want to execute it, there is a run.sh script to help you out.

bash -x run.sh

Positional File Contract

Order | Field         | Type   | Size | Observation
------|---------------|--------|------|-------------------------------------
1     | Date          | string | 08   | Date pattern 👉 yyyyMMdd.
2     | Average Price | float  | 12   | The last 2 digits are the decimals.
3     | Total Volume  | int    | 10   |
4     | Region        | string | 25   | Must be in upper case.

💡 Characters per line = 55.

For the extra characters, fill (see the sketch after this list):

  • strings with "whitespace" ( ).
  • float and int with "zero" (0).
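
A minimal sketch of those padding rules with Spark SQL functions. The column names follow the example record further below and are only illustrative, not necessarily the dataset's real headers:

import org.apache.spark.sql.functions.{col, lpad, regexp_replace, trim, upper}

// Numbers: keep only the digits, then left-pad with "0" up to the contract size.
val priceCol  = lpad(regexp_replace(col("average_price"), "\\D", ""), 12, "0") // 11.50 -> 000000001150
// Strings: upper case, then left-pad with whitespace up to the contract size.
val regionCol = lpad(upper(trim(col("region"))), 25, " ")                      // Brazil -> "   ...BRAZIL" (25 chars)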

Data x File Example

Input data example.

date: "2020-09-01"
average_price: 11.50
total_volume: 1050
region: "Brazil"

Expected output file.

202009010000000011500000001050                   BRAZIL
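
Breaking that 55-character line down by the contract sizes:

  • Date (8 chars) 👉 20200901
  • Average Price (12 chars) 👉 000000001150, read as 11.50
  • Total Volume (10 chars) 👉 0000001050
  • Region (25 chars) 👉 BRAZIL right-aligned, padded with 19 whitespaces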

Output

wc -l data/avocado.csv 
# 18250 data/avocado.csv

wc -l data/AVOCADO.TXT
# 18249 data/AVOCADO.TXT
# One line is missing: it's just the CSV header.

head data/AVOCADO.TXT
# 201501040000000001220004087328                   ALBANY
# 201501040000000001790000137395                   ALBANY
# 201501040000000000100043502149                  ATLANTA
# 201501040000000001760000384669                  ATLANTA
# 201501040000000001080078802506      BALTIMOREWASHINGTON
# 201501040000000001290001913728      BALTIMOREWASHINGTON
# 201501040000000001010008003432                    BOISE
# 201501040000000001640000150512                    BOISE
# 201501040000000001020004917380                   BOSTON
# 201501040000000001830000219213                   BOSTON
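
If you want an extra sanity check, a small snippet like the one below (run from the same spark-shell session, where spark is already in scope) should report zero records breaking the 55-character contract:

import org.apache.spark.sql.functions.length

// Count output lines whose width differs from the positional contract (expected: 0).
val out = spark.read.text("data/AVOCADO.TXT")
out.filter(length(out("value")) =!= 55).count()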

Further Help

If you want to execute this code locally, you have to download the dataset from Kaggle and then add the file to the ./data/ folder.

import org.apache.spark.sql.{SparkSession, DataFrame, Column}
import org.apache.spark.sql.functions._
import spark.implicits._

// Reads the CSV dataset, keeping the header line as column names.
def read(spark: SparkSession, file: String): DataFrame = {
  println(s"Reading file '$file'...")
  spark.read
    .option("header", "true")
    .option("delimiter", ",")
    .csv(file)
}

// Writes the DataFrame as plain text in a single partition, i.e. a single output file.
def save(df: DataFrame, path: String): Unit = {
  println(s"Writing data to '$path'...")
  df.coalesce(1)
    .write
    .mode("overwrite")
    .text(path)
}

// Keeps only the digits of a column and left-pads it to the positional size.
def toText(column: Column, size: Int, character: String): Column = {
  lpad(trim(regexp_replace(column, "\\D", "")), size, character)
}

def prepare(df: DataFrame): DataFrame = {
  // Parsing Data
  val parsed = df.orderBy($"Date", $"region")
    .select(
      toText($"Date", 8, "0"),
      toText($"AveragePrice", 12, "0"),
      toText($"Total Volume", 10, "0"),
      lpad(upper(trim($"region")), 25, " ")
    )
  // To Single Column
  parsed.select(concat(parsed.columns.map(col(_)): _*))
}

// ______ __ __ __ __
// /\ == \ /\ \/\ \ /\ "-.\ \
// \ \ __< \ \ \_\ \ \ \ \-. \
// \ \_\ \_\ \ \_____\ \ \_\\"\_\
// \/_/ /_/ \/_____/ \/_/ \/_/
val BUCKET = "./data"
val df = read(spark, s"$BUCKET/avocado.csv")
println(s"This file has ${df.count()} records.")
df.printSchema()
save(prepare(df), s"$BUCKET/avocado")
println("That's all folks o/")
sys.exit(0)
#!/bin/bash -x
# @script run.sh
# @author Anthony Vilarim Caliani
# @contact github.com/avcaliani
#
# @description
# Script to execute Spark Shell jobs...
#
# @usage
# ./run.sh
# Run the Spark job, then keep only the final positional file as data/AVOCADO.TXT.
spark-shell -I main.scala \
  && rm -f data/*.txt data/avocado/*.crc \
  && mv data/avocado/*.txt data/AVOCADO.TXT \
  && rm -rf data/avocado \
  && exit 0