Zoltan C. Toth zoltanctoth

## split_text.py
# Demo input: a long run of repeated sentences standing in for a real document.
long_text = 3000 * "Lorem ipsum dolor sit amet, consectetur adipiscing elit. "
# Maximum size of one request, measured here with len(), i.e. in characters.
# NOTE(review): the service limit may be defined in bytes rather than
# characters - confirm before using this with non-ASCII text.
COMPREHEND_LIMIT = 5000

# Split on sentence boundaries so a chunk never ends in the middle of a sentence.
lines = long_text.split(".")
last_index = len(lines) - 1

chunks = []  # every chunk handed to Comprehend, in order
current_text = ""
for index, line in enumerate(lines):
    # str.split() drops the separator: put the period back on every piece
    # except the trailing remainder that follows the final period.
    sentence = line if index == last_index else line + "."
    # Flush before the buffer would grow past the limit. The `current_text`
    # guard avoids emitting an empty chunk; a single sentence longer than the
    # limit still ends up alone in an oversized chunk.
    if current_text and len(current_text + sentence) > COMPREHEND_LIMIT:
        # EXECUTE COMPREHEND
        print(f"Executing Comprehend on {len(current_text)} characters")
        chunks.append(current_text)
        current_text = ""
    # Accumulate the sentence; without this the buffer stays empty forever
    # and the limit check above can never trigger.
    current_text += sentence

# Flush whatever is left once the input is exhausted.
if current_text:
    # EXECUTE COMPREHEND
    print(f"Executing Comprehend on {len(current_text)} characters")
    chunks.append(current_text)
    current_text = ""

## serving.py
import os

import mlflow
import uvicorn
from fastapi import FastAPI, HTTPException


class MLService:
    """Thin holder for a model object.

    NOTE(review): given the mlflow/FastAPI imports above, this presumably wraps
    a loaded MLflow model for serving - confirm against the full file.
    """

    def __init__(self, model) -> None:
        """Store ``model`` on the instance as ``self.model``."""
        self.model = model

## schema.yml
models:
  - name: dim_listings_w_hosts
    tests:
      - dbt_expectations.expect_table_row_count_to_equal_other_table:
          compare_model: source('airbnb', 'listings')
    columns:
      - name: price
        tests:
          - dbt_expectations.expect_column_values_to_be_of_type:
              column_type: number

## my_custom_test.sql
{#
  Generic dbt test: fails when any row of `model` has a non-NULL `column_name`
  value outside the inclusive range 0-125.

  A generic test passes only when its query returns zero rows, so the query
  selects the offending rows themselves. An aggregate such as COUNT(*) always
  yields exactly one row and would therefore be reported as a failure on
  every run. Rows where the column is NULL are not returned, because the
  predicate evaluates to NULL for them.
#}
{% test valid_age(model, column_name) %}
SELECT *
FROM {{ model }}
WHERE NOT {{ column_name }} BETWEEN 0 AND 125
{% endtest %}

## packages.yml
# Install the dbt_expectations package, restricted to the 0.6.x release series.
packages:
  - package: calogica/dbt_expectations
    version: [">=0.6.0", "<0.7.0"]

## print-without-newline.py
# Print a string without adding a newline:
# passing end="" replaces print()'s default trailing "\n" with nothing.
print("Hey, Python prints without a newline.", end ="")

# Alternative solution: write to the stdout stream directly;
# sys.stdout.write() emits exactly the given text and appends nothing.
import sys
sys.stdout.write("Hey, Python prints without a newline.")

# You are part of an experiment on how well gists can be used as "StackOverflow".
# Please add a comment or a star if you found this useful. :) Thanks!

## batch delete files s3
# Hint: If you are stuck by having tens of millions of files under an S3 Prefix, perhaps
#       the easiest is to set the prefix's Expiration to one day in the Lifecycle Management
#       pane of the bucket in the Web UI and Amazon will take care of the object deletion for you

# The scripts below are adapted from this ServerFault comment:
# https://serverfault.com/questions/679989/most-efficient-way-to-batch-delete-s3-files#comment1200074_917740

# List all objects

aws s3api list-objects --output text --bucket <<BUCKET_NAME>> --query 'Contents[].[Key]' --prefix <<prefix, like tmp/sandbox>> | pv -l

## list_run_adf_pipeline.py
from azure.common.credentials import ServicePrincipalCredentials
from azure.mgmt.datafactory import DataFactoryManagementClient
from azure.mgmt.datafactory.models import *

# Azure identifiers and service-principal credentials used to authenticate.
# NOTE(review): the subscription, client and tenant IDs are hard-coded and the
# secret is a placeholder; consider loading all of them from environment
# variables or a secret store instead of keeping them in source.
subscription_id = '8d1dc324-4f8a-4be5-ae74-310e2f5596a5'
credentials = ServicePrincipalCredentials(client_id='dcf2637e-8f81-4bbb-a72e-ac2f291e328b', secret='<<  secret   >>', tenant='874cd0d6-f21a-4c6e-8239-51287476f635')
# Data Factory management client built from the credentials and subscription above.
adf_client = DataFactoryManagementClient(credentials, subscription_id)
# Fetch the factory's pipelines; the two arguments are presumably the resource
# group name and the factory name - confirm against the SDK reference.
pipelines = adf_client.pipelines.list_by_factory("schneider-test", "Schneider-Test-Data-Factory")

for p in pipelines:

## save-and-load-native-lightgbm-model-mlflow
import lightgbm as lgb

# Imagine pipelineModel stages are [x, x, x, trainLightGBMModel]
# Export the last pipeline stage through its saveNativeModel() method.
# NOTE(review): `model` is not defined in this snippet; it must already be in scope.
model.stages[-1].saveNativeModel("/tmp/lightgbm")

# Load the exported text file as a native LightGBM Booster.
# NOTE(review): the part-file name is specific to one run, and the /dbfs prefix
# presumably points at the same location as "/tmp/lightgbm" above through a
# DBFS mount - confirm both before reusing this path.
nativeLGBModel = lgb.Booster(model_file="/dbfs/tmp/lightgbm/part-00000-tid-5517958219000636906-02c16955-a283-4198-a41a-cdbd78f5aae5-455-1-c000.txt")

# Log the Booster to MLflow under the "lightgbm-model" artifact path.
# NOTE(review): `mlflow` is not imported in this snippet; it must already be in scope.
mlflow.lightgbm.log_model(nativeLGBModel, artifact_path="lightgbm-model")

## Monitor Azure Costs.sh
#! /usr/bin/env bash
# Abort the script as soon as a command fails.
set -e

# Spending threshold in USD, applied per account.
DAILY_SPENDING_LIMIT=3

# Choose the date binary: use gdate when it can be found, otherwise plain date.
if hash gdate 2>/dev/null; then
   DATE_COMMAND="gdate"
else
   DATE_COMMAND="date"
fi
	long_text = 3000 * "Lorem ipsum dolor sit amet, consectetur adipiscing elit. "
	COMPREHEND_LIMIT = 5000

	lines = long_text.split(".")
	current_text = ""
	for line in lines:
	if len(current_text + line) > COMPREHEND_LIMIT:
	# EXECUTE COMPREHEND
	print(f"Executing Comprehend on {len(current_text)} characters")
	current_text = ""
	import os

	import mlflow
	import uvicorn
	from fastapi import FastAPI, HTTPException


	class MLService:
	def __init__(self, model):
	self.model = model
	models:
	- name: dim_listings_w_hosts
	tests:
	- dbt_expectations.expect_table_row_count_to_equal_other_table:
	compare_model: source('airbnb', 'listings')
	columns:
	- name: price
	tests:
	- dbt_expectations.expect_column_values_to_be_of_type:
	column_type: number
	{% test valid_age(model, column_name) %}
	SELECT COUNT(*) FROM {{ model }}
	WHERE NOT {{ column_name}} BETWEEN 0 and 125
	{% endtest %}
	packages:
	- package: calogica/dbt_expectations
	version: [">=0.6.0", "<0.7.0"]
	# Print a string without adding a newline
	print("Hey, Python prints without a newline.", end ="")

	# Alternative solution
	import sys
	sys.stdout.write("Hey, Python prints without a newline.")

	# You are part of an experiment on how well gists can be used as "StackOverflow".
	# Please add a comment or a star if you found this useful. :) Thanks!
	from azure.common.credentials import ServicePrincipalCredentials
	from azure.mgmt.datafactory import DataFactoryManagementClient
	from azure.mgmt.datafactory.models import *

	subscription_id = '8d1dc324-4f8a-4be5-ae74-310e2f5596a5'
	credentials = ServicePrincipalCredentials(client_id='dcf2637e-8f81-4bbb-a72e-ac2f291e328b', secret='<< secret >>', tenant='874cd0d6-f21a-4c6e-8239-51287476f635')
	adf_client = DataFactoryManagementClient(credentials, subscription_id)
	pipelines = adf_client.pipelines.list_by_factory("schneider-test", "Schneider-Test-Data-Factory")

	for p in pipelines:
	import lightgbm as lgb

	# Imagine pipelineModel stages are [x, x, x, trainLightGBMModel]
	model.stages[-1].saveNativeModel("/tmp/lightgbm")

	nativeLGBModel = lgb.Booster(model_file="/dbfs/tmp/lightgbm/part-00000-tid-5517958219000636906-02c16955-a283-4198-a41a-cdbd78f5aae5-455-1-c000.txt")

	mlflow.lightgbm.log_model(nativeLGBModel, artifact_path="lightgbm-model")
	#! /usr/bin/env bash
	set -e

	DAILY_SPENDING_LIMIT=3 # USD, per account

	DATE_COMMAND="date"
	if hash gdate 2>/dev/null
	then
	DATE_COMMAND="gdate"
	fi