Created
November 15, 2018 17:42
-
-
Save korkridake/b053b9b6ffb21951e23d4db4a25b010f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# =================================================================
# GLOBAL CONFIGURATIONS
# =================================================================
# Source file location and format.
file_location = "/FileStore/tables/Cars93.csv"
file_type = "csv"

# -----------------------------------------------------------------
# CSV reader options (passed to spark.read below).
# -----------------------------------------------------------------
infer_schema = "true"          # let Spark infer each column's type
first_row_is_header = "true"   # treat the first row as column names
delimiter = ","                # fields are comma-separated
# =================================================================
# Load the CSV into a DataFrame.
# The reader options above apply to CSV files only; other file
# types silently ignore them.
# =================================================================
df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)

# Render the DataFrame as an interactive table (Databricks built-in).
display(df)
# ---------------------------------------------------------------
# Run the Command Line in the Databricks Workspace
# ---------------------------------------------------------------
- %fs runs file-system (dbutils.fs) commands from a notebook cell.
- ls = list the files in the given directory
%fs
ls /FileStore/tables/
# --------------------------------------------------------------- | |
# --------------------------------------------------------------- | |
# Load CSV. | |
# --------------------------------------------------------------- | |
# --------------------------------------------------------------- | |
%python | |
df_2 = spark.read.csv("dbfs:/FileStore/tables/Cars93.csv", header="true", inferSchema="true") # shorter command than above | |
df_2 = df_2[['Model', 'Type', 'Price']] # choose just some columns (just like typical python commands) | |
df_2.show() # display the dataframe | |
df_2.printSchema() # check data structure | |
%scala
// Load the diamonds sample dataset shipped with Databricks.
// `sqlContext` has been deprecated since Spark 2.0 — use the `spark`
// SparkSession entry point instead (identical DataFrameReader API).
val diamonds = spark.read.format("csv")
  .option("header", "true")       // first row holds the column names
  .option("inferSchema", "true")  // let Spark infer each column's type
  .load("/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv")

display(diamonds)
# =================================================================
# Work with SQL
# =================================================================
# Register the DataFrame as a temporary view so that subsequent
# %sql cells can query it by name.
temp_table_name = "cars93_csv"
df.createOrReplaceTempView(temp_table_name)
%sql
-- Preview every row of the registered temp view.
SELECT * FROM `cars93_csv`

%sql
-- Standard SQL uses `=` for equality; `==` is a non-portable Spark
-- extension (same result here, but `=` works on every SQL engine).
SELECT Manufacturer, Model, Type
FROM cars93_csv
WHERE Manufacturer = 'Acura'
%% END of EP.1 %%
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment