Created
January 16, 2022 23:12
-
-
Save RachidAZ/7935d7c8300f53320014cfbb95577ff6 to your computer and use it in GitHub Desktop.
Quick access to a data lake (ADLS Gen2) from Databricks, and saving a DataFrame as a single partition.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Configure access to the storage account by registering its account key
# in the Spark session configuration.
# Disclaimer: from a security perspective this is not the best way to
# access your data.
account_key_conf = "fs.azure.account.key.{storage_account_name}.dfs.core.windows.net"
account_key = "{storage_key_here}"
spark.conf.set(account_key_conf, account_key)
import datetime

# Build a glob matching every CSV in the folder for the current year and
# zero-padded month, e.g. .../raw/data01/2022/01/*.csv
now = datetime.datetime.now()
filePath = (
    'abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/raw/data01/'
    + '{}/{:02d}/*.csv'.format(now.year, now.month)
)

# Store the path under the Spark conf key 'f.filePath' so that the SQL cell
# can reference it as ${f.filePath}.
spark.conf.set('f.filePath', filePath)
%sql
-- Expose the CSV files as a temporary view. The path references the
-- f.filePath Spark conf value; the files are read with a header row.
-- mode FAILFAST: presumably aborts the read on malformed records (confirm
-- against the Spark CSV data source options).
CREATE OR REPLACE TEMPORARY VIEW V_SomeView
USING CSV
OPTIONS (
  path '${f.filePath}',
  header "true",
  mode "FAILFAST"
)
# Save the output after transformation/cleansing.
# TODO: set filePath_fact to the target folder URI (ending with '/'). While it
# is empty, the save path below is just the relative path "fact02"
# (presumably resolved against the cluster's default filesystem; confirm).
filePath_fact = ''
spark.conf.set('f.filePath_fact', filePath_fact)

df = spark.sql("select * from V_SomeView")

# repartition(1) collapses the DataFrame to a single partition before writing,
# so the output is written from one partition.
(
    df.repartition(1)
    # Built-in CSV source; "com.databricks.spark.csv" is the legacy package name.
    .write.format("csv")
    .mode("overwrite")  # replace any existing output at the target path
    .option("header", "true")
    .save(filePath_fact + "fact02")
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment