Dave Ruijter (DaveRuijter)

@DaveRuijter
DaveRuijter / generate_hash.py
Created April 3, 2022 07:13
A couple of functions to easily create an integer-based hash. Use it for the key column of a dimension.
spark.udf.register("udf_removehtmltagsfromstring", udf_removehtmltagsfromstring, "string")
import hashlib

# This is the central hashing function, used by the other functions. It uses the blake2b hashing algorithm. With a central function, we can adjust the hashing when needed.
def udf_centralhash(string: str) -> int:
    val = hashlib.blake2b(
        digest_size=6
    )  # Increase the digest size to make the hash bigger. 6 seems a good start for our use for dimensions.
    val.update(string.encode("utf-8"))  # feed the input string as utf-8 to the blake2b object
    intval = int(val.hexdigest(), 16)  # and convert the hex digest to an integer
    return intval
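
The gist preview is truncated here. As a hedged usage sketch (the UDF name, table, and column names below are assumptions, not taken from the gist), the function could be registered as a Spark SQL UDF and used to derive an integer surrogate key for a dimension:

# Hedged sketch: register the hashing function as a Spark SQL UDF and use it for a dimension key column.
# Assumes an active SparkSession named `spark` (pre-defined in Databricks / Synapse notebooks).
spark.udf.register("udf_centralhash", udf_centralhash, "long")

dim_customer = spark.sql(
    """
    SELECT udf_centralhash(CustomerBusinessKey) AS CustomerKey,
           *
    FROM customers
    """
)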
@DaveRuijter
DaveRuijter / multicolumn_expression_evaluation.py
Created January 2, 2022 09:04
Custom multi-column SQL expression evaluation expectation for the Great Expectations framework
from great_expectations.expectations.expectation import MulticolumnMapExpectation
from great_expectations.expectations.util import render_evaluation_parameter_string
from great_expectations.render.util import (
    num_to_str,
    substitute_none_for_missing,
    parse_row_condition_string_pandas_engine,
)
from scipy import stats as stats
from great_expectations.execution_engine import (
    PandasExecutionEngine,
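
The preview above only shows the imports. For orientation, here is a hedged sketch of how a multicolumn expectation is invoked on a Great Expectations Pandas dataset, using the built-in expect_multicolumn_sum_to_equal as a stand-in; the custom expectation defined in this gist would be called in a similar way once registered.

# Hedged illustration with a built-in multicolumn expectation; the custom
# expectation from this gist would expose a similar method once registered.
import pandas as pd
import great_expectations as ge

df = ge.from_pandas(pd.DataFrame({"a": [1, 4], "b": [9, 6]}))
result = df.expect_multicolumn_sum_to_equal(column_list=["a", "b"], sum_total=10)
print(result.success)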
@DaveRuijter
DaveRuijter / is_pipeline_running.json
Created December 12, 2021 11:39
ADF/ASA pipeline to verify if a pipeline is running / in progress
{
    "name": "00_is_pipeline_running",
    "properties": {
        "activities": [
            {
                "name": "Get Pipeline Runs",
                "type": "WebActivity",
                "dependsOn": [
                    {
                        "activity": "getSubscriptionID",
@DaveRuijter
DaveRuijter / pipeline-backup-daily.yml
Created October 21, 2021 20:32
This YAML pipeline is part of the Data Lake Backup Strategy
parameters:
  - name: backupStore
    displayName: 'Backup 05 store'
    type: boolean
    default: true
  - name: backupBronze
    displayName: 'Backup 10 bronze'
    type: boolean
    default: true
  - name: backupSilver
@DaveRuijter
DaveRuijter / pipeline-backup-weekly.yml
Created October 21, 2021 20:30
This YAML is part of the Data Lake Backup Strategy
parameters:
  - name: backupStore
    displayName: 'Backup 05 store'
    type: boolean
    default: true
  - name: backupBronze
    displayName: 'Backup 10 bronze'
    type: boolean
    default: true
  - name: backupSilver
@DaveRuijter
DaveRuijter / stage-backup-dls.yml
Created October 21, 2021 20:28
This YAML file is part of the Data Lake Backup Strategy
parameters:
  - name: dependsOnStage
    type: string
  - name: triggerPeriod
    type: string
  - name: environment
    type: string
  - name: backupStore
    type: boolean
  - name: backupBronze
@DaveRuijter
DaveRuijter / job-backup-dls.yml
Last active November 23, 2021 19:32
This YAML file is part of the Backup Strategy
parameters:
  - name: backups
    displayName: 'Array of backups'
    type: object
    default: []
  - name: serviceConnectionName
    displayName: 'Name of the DevOps Service Connection'
    type: string
  - name: execute
    displayName: 'Execute this Job'
@DaveRuijter
DaveRuijter / backup-dls.ps1
Created October 21, 2021 20:16
This PowerShell script performs a copy between two storage accounts using AzCopy.
param(
    [String]$sourceStorageAccount,
    [String]$targetStorageAccount,
    [String]$sourceFolder,
    [String]$targetFolder,
    [String]$sourceSasToken,
    [String]$targetSasToken,
    [String]$triggerPeriod,
    [Int32]$azCopyConcurrency
)
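
The preview shows only the parameter block. As a hedged sketch of what such a copy boils down to, azcopy copy takes the source and target URLs with their SAS tokens appended, and the AZCOPY_CONCURRENCY_VALUE environment variable caps concurrency. The Python wrapper below is illustrative only (account, container, and folder names are placeholders); the gist itself does this in PowerShell.

# Hedged sketch: invoke AzCopy to copy a folder between two Data Lake storage accounts.
# Account, container, and folder names and the SAS tokens are placeholders.
import os
import subprocess

source_url = "https://<source-account>.dfs.core.windows.net/dls/<source-folder>?<source-sas-token>"
target_url = "https://<target-account>.dfs.core.windows.net/dls/<target-folder>?<target-sas-token>"

env = os.environ.copy()
env["AZCOPY_CONCURRENCY_VALUE"] = "32"  # corresponds to the $azCopyConcurrency parameter

subprocess.run(
    ["azcopy", "copy", source_url, target_url, "--recursive", "--overwrite=ifSourceNewer"],
    check=True,
    env=env,
)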
@DaveRuijter
DaveRuijter / data_lake_sta_lifecycle_policy_rules.json
Created October 21, 2021 19:28
This policy is used on the storage account of the Data Lake. It ensures that new data in the dls/05_store/_archive folder of the lake is automatically moved to the cool access tier.
{
    "rules": [
        {
            "enabled": true,
            "name": "daily-moving-data-lake-store-archive-to-cool",
            "type": "Lifecycle",
            "definition": {
                "actions": {
                    "baseBlob": {
                        "tierToCool": {
@DaveRuijter
DaveRuijter / backup_sta_lifecycle_policy_rules.json
Created October 21, 2021 19:27
This policy is used on the storage account that contains the backup copies of the Data Lake. It applies a retention of 60 days to the weekly backups and a retention of 30 days to the daily (incremental) backups.
{
    "rules": [
        {
            "enabled": true,
            "name": "weeklybackupsrule",
            "type": "Lifecycle",
            "definition": {
                "actions": {
                    "baseBlob": {
                        "delete": {