Skip to content

Instantly share code, notes, and snippets.

@korkridake
Created November 15, 2018 17:42
Show Gist options
  • Save korkridake/b053b9b6ffb21951e23d4db4a25b010f to your computer and use it in GitHub Desktop.
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# GLOBAL CONFIGURATIONS
# Shared settings consumed by the spark.read call further below.
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# File location and type
# ---------------------------------------------------------------
file_location = "/FileStore/tables/Cars93.csv"
file_type = "csv"
# ---------------------------------------------------------------
# CSV options (Spark accepts the strings "true"/"false" for these)
# ---------------------------------------------------------------
infer_schema = "true" # automatically infer each column's type from the data
first_row_is_header = "true" # treat the first CSV row as column names
delimiter = "," # field separator character
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# The applied options are for CSV files. For other file types, these will be ignored.
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# Read the CSV into a DataFrame using the global configuration above.
# A parenthesized call chain replaces the backslash line continuations.
df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)
# Render the DataFrame as an interactive table in the notebook output.
display(df)
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# Run the Command Line in the Databricks Workspace
# ---------------------------------------------------------------
# ---------------------------------------------------------------
- %fs runs Databricks file-system (DBFS) commands from a notebook cell.
- ls lists the files in the given directory.
%fs
ls /FileStore/tables/
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# Load CSV.
# ---------------------------------------------------------------
# ---------------------------------------------------------------
%python
# Same load as the cell above, via the read.csv shortcut instead of
# the format()/option() chain.
df_2 = spark.read.csv(
    "dbfs:/FileStore/tables/Cars93.csv",
    header="true",
    inferSchema="true",
)
# Keep only the columns of interest; select() is equivalent to the
# bracket-indexing form df_2[['Model', 'Type', 'Price']].
df_2 = df_2.select("Model", "Type", "Price")
# Print the first rows to the notebook output.
df_2.show()
# Inspect the inferred column names and types.
df_2.printSchema()
%scala
// Load the diamonds sample dataset bundled with Databricks runtimes.
// NOTE(review): sqlContext is the legacy Spark 1.x entry point; on
// Spark 2+ the unified SparkSession (`spark.read`) is preferred — kept
// as-is here to preserve the cell's behavior exactly.
val diamonds = sqlContext.read
  .format("csv")
  .option("header", "true")      // first row carries the column names
  .option("inferSchema", "true") // detect column types automatically
  .load("/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv")
// Render the result as an interactive table.
display(diamonds)
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# Work with SQL
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# Register df as a temporary SQL view so the %sql cells below can query it.
temp_table_name = "cars93_csv"
# The view is session-scoped: it disappears when the Spark session ends.
df.createOrReplaceTempView(temp_table_name)
%sql
-- Preview every column of the temp view registered from df above.
SELECT * FROM `cars93_csv`
%sql
-- Project manufacturer, model, and type for Acura vehicles only.
SELECT Manufacturer, Model, Type
FROM cars93_csv
-- Use the standard SQL equality operator `=`; `==` is a Spark-only
-- extension and is not portable to other SQL dialects.
WHERE Manufacturer = 'Acura'
%% END of EP.1 %%
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment