Skip to content

Instantly share code, notes, and snippets.

@dvu4
Last active April 18, 2023 15:27
Show Gist options
  • Save dvu4/231c96186cc72f33d8d21234db0900ad to your computer and use it in GitHub Desktop.
Save dvu4/231c96186cc72f33d8d21234db0900ad to your computer and use it in GitHub Desktop.

!pip install pyyaml

import yaml
from pyspark.dbutils import DBUtils
import pyspark.sql.functions as F
import time


def get_lastest_file(path):
  """Return the most recently modified entry listed directly under ``path``.

  The listing comes from ``dbutils.fs.ls``; ``dbutils`` is not defined in
  this block and is presumably supplied by the Databricks notebook runtime.
  Entries are not filtered, so whatever ``ls`` returns is a candidate.

  Args:
    path: Location to list.

  Returns:
    A ``(file_path, modified)`` tuple. ``modified`` is the entry's
    modification time formatted as ``"%Y-%m-%d %H:%M:%S"`` in the local
    time zone of the process.

  Raises:
    ValueError: If the listing for ``path`` is empty.
  """
  entries = dbutils.fs.ls(path)
  if not entries:
    raise ValueError(f"No files found under {path!r}")

  # Rank on the raw numeric timestamp instead of the formatted string:
  # formatting truncates to whole seconds, and local wall-clock strings can
  # sort out of order when the clock is set back (e.g. a DST change).
  newest = max(entries, key=lambda entry: entry.modificationTime)

  # modificationTime is divided by 1000 to get the seconds that
  # time.localtime expects.
  modified = time.strftime(
    "%Y-%m-%d %H:%M:%S", time.localtime(newest.modificationTime / 1000)
  )
  return (newest.path, modified)

# Load the latest file from the tdtrans storage account

# Components of the storage account name.
env, workspace_type, short_code = "prodfix", "t", "dtrans"

# Landing container and the storage account that hosts it.
container = 'ds-tdtrans-landing'
sa = f"{env}dseus2{workspace_type}{short_code}sa01"

# ABFSS URI of the container root.
path = f"abfss://{container}@{sa}.dfs.core.windows.net/"

# get_lastest_file returns a tuple; its first item is the file's path.
lastest_file = get_lastest_file(path)
file_path = lastest_file[0]


# Read the CSV at file_path, treating the first row as the header and
# letting Spark infer the column types.
df = (
  spark.read
  .format("csv")
  .options(inferSchema=True, header=True)
  .load(file_path)
)

# Business-user format: expose EID as consumer_src_mid and keep only that column.
df = df.withColumnRenamed("EID", "consumer_src_mid")
df = df.select("consumer_src_mid")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment