Skip to content

Instantly share code, notes, and snippets.

@korkridake
Created November 15, 2018 17:42
Show Gist options
  • Save korkridake/b053b9b6ffb21951e23d4db4a25b010f to your computer and use it in GitHub Desktop.
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# GLOBAL CONFIGURATIONS
# Shared settings consumed by the spark.read call further below.
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# File location and type
# ---------------------------------------------------------------
file_location = "/FileStore/tables/Cars93.csv"
file_type = "csv"
# ---------------------------------------------------------------
# CSV options (Spark accepts the strings "true"/"false" for these)
# ---------------------------------------------------------------
infer_schema = "true" # automatically infer each column's type from the data
first_row_is_header = "true" # treat the first CSV row as column names
delimiter = "," # field separator character
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# The applied options are for CSV files. For other file types, these will be ignored.
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# Read the CSV into a DataFrame using the global configuration above.
# A parenthesized call chain replaces the backslash line continuations.
df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)
# Render the DataFrame as an interactive table in the notebook output.
display(df)
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# Run the Command Line in the Databricks Workspace
# ---------------------------------------------------------------
# ---------------------------------------------------------------
- %fs runs Databricks file-system (DBFS) commands from a notebook cell.
- ls lists the files in the given directory.
%fs
ls /FileStore/tables/
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# Load CSV.
# ---------------------------------------------------------------
# ---------------------------------------------------------------
%python
# Same load as the cell above, via the read.csv shortcut instead of
# the format()/option() chain.
df_2 = spark.read.csv(
    "dbfs:/FileStore/tables/Cars93.csv",
    header="true",
    inferSchema="true",
)
# Keep only the columns of interest; select() is equivalent to the
# bracket-indexing form df_2[['Model', 'Type', 'Price']].
df_2 = df_2.select("Model", "Type", "Price")
# Print the first rows to the notebook output.
df_2.show()
# Inspect the inferred column names and types.
df_2.printSchema()
%scala
// Load the diamonds sample dataset bundled with Databricks runtimes.
// NOTE(review): sqlContext is the legacy Spark 1.x entry point; on
// Spark 2+ the unified SparkSession (`spark.read`) is preferred — kept
// as-is here to preserve the cell's behavior exactly.
val diamonds = sqlContext.read
  .format("csv")
  .option("header", "true")      // first row carries the column names
  .option("inferSchema", "true") // detect column types automatically
  .load("/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv")
// Render the result as an interactive table.
display(diamonds)
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# Work with SQL
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# Register df as a temporary SQL view so the %sql cells below can query it.
temp_table_name = "cars93_csv"
# The view is session-scoped: it disappears when the Spark session ends.
df.createOrReplaceTempView(temp_table_name)
%sql
-- Preview every column of the temp view registered from df above.
SELECT * FROM `cars93_csv`
%sql
-- Project manufacturer, model, and type for Acura vehicles only.
SELECT Manufacturer, Model, Type
FROM cars93_csv
-- Use the standard SQL equality operator `=`; `==` is a Spark-only
-- extension and is not portable to other SQL dialects.
WHERE Manufacturer = 'Acura'
%% END of EP.1 %%
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment