@geraldstanje
Created January 7, 2023 18:44
databricks file
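# Databricks job deployment config. The layout appears to follow the dbx
# `deployment.yml` format: reusable cluster definitions are declared once under
# `custom:` as YAML anchors (&name) and pulled into per-environment workflows
# with merge keys (<<: *name), so each environment only states what differs.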
custom:
  mm_tags: &mm-tags
    "Application Type": "Back End"
    Product: Optimization
    "Sub Department": "Data Science"
    Department: Engineering
    "Service Name": cmv
    "Repo Name": cmv-st
    "Purpose": prod
    "Category": Production

  mm_dev_tags: &mm-dev-tags
    "Application Type": "Back End"
    Product: Optimization
    "Sub Department": "Data Science"
    Department: Engineering
    "Service Name": cmv
    "Repo Name": cmv-st
    "Purpose": dev
    "Category": Development
  model-cluster-props: &model-cluster-props
    spark_version: "10.4.x-cpu-ml-scala2.12"
    node_type_id: "i3.4xlarge"
    init_scripts:
      - dbfs:
          destination: "dbfs:/databricks/install_lzo_and_configure.sh"
    spark_conf:
      spark.master: "local[*, 4]"
      spark.databricks.cluster.profile: "singleNode"
    aws_attributes:
      first_on_demand: 1
      availability: "ON_DEMAND"
      zone_id: "us-east-1e"
    custom_tags:
      <<: *mm-tags

  model-dev-cluster-props: &model-dev-cluster-props
    spark_version: "10.4.x-cpu-ml-scala2.12"
    node_type_id: "i3.4xlarge"
    init_scripts:
      - dbfs:
          destination: "dbfs:/databricks/install_lzo_and_configure.sh"
    spark_conf:
      spark.master: "local[*, 4]"
      spark.databricks.cluster.profile: "singleNode"
    aws_attributes:
      first_on_demand: 1
      availability: "ON_DEMAND"
      zone_id: "us-east-1e"
    custom_tags:
      <<: *mm-dev-tags
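
  # ETL clusters: workers bid on spot capacity at up to 100% of the on-demand
  # price and fall back to on-demand ("SPOT_WITH_FALLBACK"); first_on_demand: 1
  # keeps the first node (the driver) on an on-demand instance so it survives
  # spot reclamation.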
  etl-cluster-props: &etl-cluster-props
    spark_version: "10.4.x-cpu-ml-scala2.12"
    node_type_id: "r4.4xlarge"
    init_scripts:
      - dbfs:
          destination: "dbfs:/databricks/install_lzo_and_configure.sh"
    aws_attributes:
      first_on_demand: 1
      availability: "SPOT_WITH_FALLBACK"
      zone_id: "us-east-1e"
      spot_bid_price_percent: 100
      ebs_volume_type: "GENERAL_PURPOSE_SSD"
      ebs_volume_count: 1
      ebs_volume_size: 100
    custom_tags:
      <<: *mm-tags

  etl-dev-cluster-props: &etl-dev-cluster-props
    spark_version: "10.4.x-cpu-ml-scala2.12"
    node_type_id: "r4.4xlarge"
    init_scripts:
      - dbfs:
          destination: "dbfs:/databricks/install_lzo_and_configure.sh"
    aws_attributes:
      first_on_demand: 1
      availability: "SPOT_WITH_FALLBACK"
      zone_id: "us-east-1e"
      spot_bid_price_percent: 100
      ebs_volume_type: "GENERAL_PURPOSE_SSD"
      ebs_volume_count: 1
      ebs_volume_size: 100
    custom_tags:
      <<: *mm-dev-tags
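
  # Curve clusters: an oversized driver (r4.8xlarge vs r4.2xlarge workers) plus
  # spark.driver.maxResultSize: 0 (no cap) lets large results be collected to
  # the driver, bounded only by driver memory.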
  curve-cluster-props: &curve-cluster-props
    spark_version: "10.4.x-cpu-ml-scala2.12"
    node_type_id: "r4.2xlarge"
    driver_node_type_id: "r4.8xlarge"
    init_scripts:
      - dbfs:
          destination: "dbfs:/databricks/install_lzo_and_configure.sh"
    spark_conf:
      spark.driver.maxResultSize: 0
    aws_attributes:
      first_on_demand: 1
      availability: "ON_DEMAND"
      zone_id: "us-east-1e"
      ebs_volume_type: "GENERAL_PURPOSE_SSD"
      ebs_volume_count: 1
      ebs_volume_size: 100
    custom_tags:
      <<: *mm-tags

  curve-dev-cluster-props: &curve-dev-cluster-props
    spark_version: "10.4.x-cpu-ml-scala2.12"
    node_type_id: "r4.2xlarge"
    driver_node_type_id: "r4.8xlarge"
    init_scripts:
      - dbfs:
          destination: "dbfs:/databricks/install_lzo_and_configure.sh"
    spark_conf:
      spark.driver.maxResultSize: 0
    aws_attributes:
      first_on_demand: 1
      availability: "ON_DEMAND"
      zone_id: "us-east-1e"
      ebs_volume_type: "GENERAL_PURPOSE_SSD"
      ebs_volume_count: 1
      ebs_volume_size: 100
    custom_tags:
      <<: *mm-dev-tags
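
  # Shared autoscale ranges, merged with the base cluster props below to build
  # complete new_cluster blocks.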
  etl-auto-scale-props: &etl-auto-scale-props
    autoscale:
      min_workers: 2
      max_workers: 8

  curve-auto-scale-props: &curve-auto-scale-props
    autoscale:
      min_workers: 2
      max_workers: 8
  etl-static-cluster: &etl-static-cluster
    new_cluster:
      <<: *etl-cluster-props
      num_workers: 2

  model-static-cluster: &model-static-cluster
    new_cluster:
      <<: *model-cluster-props
      num_workers: 0

  model-dev-static-cluster: &model-dev-static-cluster
    new_cluster:
      <<: *model-dev-cluster-props
      num_workers: 0

  etl-autoscale-cluster: &etl-autoscale-cluster
    new_cluster:
      <<: # merge these two maps and place them here.
        - *etl-cluster-props
        - *etl-auto-scale-props

  etl-dev-autoscale-cluster: &etl-dev-autoscale-cluster
    new_cluster:
      <<: # merge these two maps and place them here.
        - *etl-dev-cluster-props
        - *etl-auto-scale-props

  curve-autoscale-cluster: &curve-autoscale-cluster
    new_cluster:
      <<: # merge these two maps and place them here.
        - *curve-cluster-props
        - *curve-auto-scale-props

  curve-dev-autoscale-cluster: &curve-dev-autoscale-cluster
    new_cluster:
      <<: # merge these two maps and place them here.
        - *curve-dev-cluster-props
        - *curve-auto-scale-props
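
# A merge key accepts a single alias or a sequence of aliases. For reference,
# *etl-autoscale-cluster above resolves to approximately:
#
#   new_cluster:
#     spark_version: "10.4.x-cpu-ml-scala2.12"
#     node_type_id: "r4.4xlarge"
#     init_scripts: [ ... ]
#     aws_attributes: { ... }
#     custom_tags: { ... }
#     autoscale:
#       min_workers: 2
#       max_workers: 8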
build:
  python: "poetry"
environments:
  default:
    workflows:
      - name: "bidstat-reader"
        <<: *etl-dev-autoscale-cluster
        email_notifications:
          on_start: [ "asadagopan@mediamath.com", "gstanje@mediamath.com", "wronsiek@mediamath.com" ]
          on_success: [ "asadagopan@mediamath.com", "gstanje@mediamath.com", "wronsiek@mediamath.com" ]
          on_failure: [ "asadagopan@mediamath.com", "gstanje@mediamath.com", "wronsiek@mediamath.com" ]
          no_alert_for_skipped_runs: false
        spark_python_task:
          python_file: "file://src/bidstat_reader.py"

      - name: "experiment"
        <<: *model-dev-static-cluster
        max_concurrent_runs: 3
        spark_python_task:
          python_file: "file://src/experiment_tf.py"

      - name: "curve"
        <<: *curve-dev-autoscale-cluster
        max_concurrent_runs: 3
        spark_python_task:
          python_file: "file://src/curve.py"

      - name: "evaluate"
        <<: *curve-dev-autoscale-cluster
        spark_python_task:
          python_file: "file://src/evaluate_tf.py"

      - name: "predict"
        <<: *model-dev-static-cluster
        spark_python_task:
          python_file: "file://src/predict.py"

      - name: "model_plot"
        <<: *model-dev-static-cluster
        spark_python_task:
          python_file: "file://src/model_plot.py"
  dev:
    workflows:
      - name: "cmv3-dev"
        format: MULTI_TASK
        email_notifications:
          on_start: [ "asadagopan@mediamath.com", "gstanje@mediamath.com", "wronsiek@mediamath.com" ]
          on_success: [ "asadagopan@mediamath.com", "gstanje@mediamath.com", "wronsiek@mediamath.com" ]
          on_failure: [ "asadagopan@mediamath.com", "gstanje@mediamath.com", "wronsiek@mediamath.com" ]
          no_alert_for_skipped_runs: false
        job_clusters:
          - job_cluster_key: "etl-cluster"
            <<: *etl-dev-autoscale-cluster
          - job_cluster_key: "model-cluster"
            <<: *model-dev-static-cluster
          - job_cluster_key: "curve-cluster"
            <<: *curve-dev-autoscale-cluster
        tasks:
          - task_key: "bidstat-reader"
            job_cluster_key: "etl-cluster"
            max_retries: 0
            spark_python_task:
              python_file: "file://src/bidstat_reader.py"
              parameters: [ "--config-file", "generate_data_dev.yaml" ]
          - task_key: "model-generation"
            job_cluster_key: "model-cluster"
            max_retries: 0
            spark_python_task:
              python_file: "file://src/experiment_tf.py"
              parameters: [ "--config-file", "model_dev.yaml" ]
            depends_on:
              - task_key: "bidstat-reader"
          - task_key: "curve-building"
            job_cluster_key: "curve-cluster"
            max_retries: 0
            spark_python_task:
              python_file: "file://src/curve.py"
              parameters: [ "--config-file", "model_dev.yaml" ]
            depends_on:
              - task_key: "model-generation"
          - task_key: "evaluate-results"
            job_cluster_key: "curve-cluster"
            max_retries: 0
            spark_python_task:
              python_file: "file://src/evaluate_tf.py"
              parameters: [ "--config-file", "model_dev.yaml" ]
            depends_on:
              - task_key: "curve-building"
          - task_key: "copy-artifacts"
            job_cluster_key: "model-cluster"
            max_retries: 0
            spark_python_task:
              python_file: "file://src/serving_artifacts.py"
              parameters: [ "--config-file", "model_dev.yaml" ]
            depends_on:
              - task_key: "curve-building"
          - task_key: "model-plot"
            job_cluster_key: "model-cluster"
            max_retries: 0
            spark_python_task:
              python_file: "file://src/model_plot.py"
              parameters: [ "--config-file", "model_dev.yaml" ]
            depends_on:
              - task_key: "copy-artifacts"
  prod_model_a:
    workflows:
      - name: "cmv3-prod-model-a"
        format: MULTI_TASK
        max_concurrent_runs: 2
        email_notifications:
          on_start: [ "asadagopan@mediamath.com", "gstanje@mediamath.com", "wronsiek@mediamath.com" ]
          on_success: [ "asadagopan@mediamath.com", "gstanje@mediamath.com", "wronsiek@mediamath.com" ]
          on_failure: [ "asadagopan@mediamath.com", "gstanje@mediamath.com", "wronsiek@mediamath.com" ]
          no_alert_for_skipped_runs: false
        job_clusters:
          - job_cluster_key: "etl-cluster"
            <<: *etl-autoscale-cluster
          - job_cluster_key: "model-cluster"
            <<: *model-static-cluster
          - job_cluster_key: "curve-cluster"
            <<: *curve-autoscale-cluster
        tasks:
          - task_key: "bidstat-reader"
            job_cluster_key: "etl-cluster"
            max_retries: 0
            spark_python_task:
              python_file: "file://src/bidstat_reader.py"
              parameters: [ "--config-file", "generate_data_cb_prod_310.yaml", "--start_date", "T-4" ]
          - task_key: "model-generation"
            job_cluster_key: "model-cluster"
            max_retries: 0
            spark_python_task:
              python_file: "file://src/experiment_tf.py"
              parameters: [ "--config-file", "model_cb_prod_310.yaml", "--model_date", "T-1" ]
            depends_on:
              - task_key: "bidstat-reader"
          - task_key: "curve-building"
            job_cluster_key: "curve-cluster"
            max_retries: 0
            spark_python_task:
              python_file: "file://src/curve.py"
              parameters: [ "--config-file", "model_cb_prod_310.yaml", "--model_date", "T-1" ]
            depends_on:
              - task_key: "model-generation"
          - task_key: "evaluate-results"
            job_cluster_key: "curve-cluster"
            max_retries: 0
            spark_python_task:
              python_file: "file://src/evaluate_tf.py"
              parameters: [ "--config-file", "model_cb_prod_310.yaml", "--model_date", "T-1" ]
            depends_on:
              - task_key: "curve-building"
          - task_key: "copy-artifacts"
            job_cluster_key: "model-cluster"
            max_retries: 0
            spark_python_task:
              python_file: "file://src/serving_artifacts.py"
              parameters: [ "--config-file", "model_cb_prod_310.yaml", "--model_date", "T-1" ]
            depends_on:
              - task_key: "curve-building"
          - task_key: "model-plot"
            job_cluster_key: "model-cluster"
            max_retries: 0
            spark_python_task:
              python_file: "file://src/model_plot.py"
              parameters: [ "--config-file", "model_cb_prod_310.yaml", "--model_date", "T-1" ]
            depends_on:
              - task_key: "copy-artifacts"
  prod_model_b:
    workflows:
      - name: "cmv3-prod-model-b"
        format: MULTI_TASK
        max_concurrent_runs: 2
        email_notifications:
          on_start: [ "asadagopan@mediamath.com", "gstanje@mediamath.com", "wronsiek@mediamath.com" ]
          on_success: [ "asadagopan@mediamath.com", "gstanje@mediamath.com", "wronsiek@mediamath.com" ]
          on_failure: [ "asadagopan@mediamath.com", "gstanje@mediamath.com", "wronsiek@mediamath.com" ]
          no_alert_for_skipped_runs: false
        job_clusters:
          - job_cluster_key: "etl-cluster"
            <<: *etl-autoscale-cluster
          - job_cluster_key: "model-cluster"
            <<: *model-static-cluster
          - job_cluster_key: "curve-cluster"
            <<: *curve-autoscale-cluster
        tasks:
          - task_key: "bidstat-reader"
            job_cluster_key: "etl-cluster"
            max_retries: 0
            spark_python_task:
              python_file: "file://src/bidstat_reader.py"
              parameters: [ "--config-file", "generate_data_cb_prod_320.yaml", "--start_date", "T-7" ]
          - task_key: "model-generation"
            job_cluster_key: "model-cluster"
            max_retries: 0
            spark_python_task:
              python_file: "file://src/experiment_tf.py"
              parameters: [ "--config-file", "model_cb_prod_320.yaml", "--model_date", "T-1" ]
            depends_on:
              - task_key: "bidstat-reader"
          - task_key: "curve-building"
            job_cluster_key: "curve-cluster"
            max_retries: 0
            spark_python_task:
              python_file: "file://src/curve.py"
              parameters: [ "--config-file", "model_cb_prod_320.yaml", "--model_date", "T-1" ]
            depends_on:
              - task_key: "model-generation"
          - task_key: "evaluate-results"
            job_cluster_key: "curve-cluster"
            max_retries: 0
            spark_python_task:
              python_file: "file://src/evaluate_tf.py"
              parameters: [ "--config-file", "model_cb_prod_320.yaml", "--model_date", "T-1" ]
            depends_on:
              - task_key: "curve-building"
          - task_key: "copy-artifacts"
            job_cluster_key: "model-cluster"
            max_retries: 0
            spark_python_task:
              python_file: "file://src/serving_artifacts.py"
              parameters: [ "--config-file", "model_cb_prod_320.yaml", "--model_date", "T-1" ]
            depends_on:
              - task_key: "curve-building"
          - task_key: "model-plot"
            job_cluster_key: "model-cluster"
            max_retries: 0
            spark_python_task:
              python_file: "file://src/model_plot.py"
              parameters: [ "--config-file", "model_cb_prod_320.yaml", "--model_date", "T-1" ]
            depends_on:
              - task_key: "copy-artifacts"
  data:
    workflows:
      - name: "bidstat-reader"
        spark_python_task:
          python_file: "file://src/bidstat_reader.py"

  experiment:
    workflows:
      - name: "experiment"
        spark_python_task:
          python_file: "file://src/experiment_tf.py"

  curve:
    workflows:
      - name: "curve"
        spark_python_task:
          python_file: "file://src/curve.py"

  evaluate:
    workflows:
      - name: "evaluate"
        spark_python_task:
          python_file: "file://src/evaluate_tf.py"

  predict:
    workflows:
      - name: "predict"
        spark_python_task:
          python_file: "file://src/predict.py"

  model_plot:
    workflows:
      - name: "model_plot"
        spark_python_task:
          python_file: "file://src/model_plot.py"