performance-related sanitized airflow config options
[core]
executor = CeleryExecutor
sql_alchemy_conn = mysql://USER:PW@AIRFLOW_MASTER_SERVER/airflowtable
# The SqlAlchemy pool size is the maximum number of database connections
# in the pool.
sql_alchemy_pool_size = 100
# The SqlAlchemy pool recycle is the number of seconds a connection
# can be idle in the pool before it is invalidated. This config does
# not apply to sqlite.
sql_alchemy_pool_recycle = 3600
# The amount of parallelism as a setting to the executor. This defines
# the max number of task instances that should run simultaneously
# on this airflow installation
parallelism = 64
# The number of task instances allowed to run concurrently within a
# single DAG
dag_concurrency = 32
# When not using pools, tasks are run in the "default pool",
# whose size is guided by this config element
non_pooled_task_slot_count = 256
# The maximum number of active DAG runs per DAG
max_active_runs_per_dag = 99
# Whether to disable pickling dags
donot_pickle = False
# How long before timing out a python file import while filling the DagBag
dagbag_import_timeout = 30
# The class to use for running task instances in a subprocess
task_runner = BashTaskRunner
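
As a quick sanity check on how the [core] knobs above relate to each other, here is a small sketch (not part of the original gist; the config path is an assumption) that reads the file with Python's stdlib configparser and asserts the usual rules of thumb:

from configparser import ConfigParser

# interpolation=None so '%' characters in connection URLs cannot trip
# configparser's value interpolation
cfg = ConfigParser(interpolation=None)
cfg.read("/home/airflow/airflow.cfg")  # hypothetical AIRFLOW_HOME location

parallelism = cfg.getint("core", "parallelism")                  # 64
dag_concurrency = cfg.getint("core", "dag_concurrency")          # 32
default_pool = cfg.getint("core", "non_pooled_task_slot_count")  # 256

# parallelism caps the whole installation, so a single DAG's concurrency
# should not exceed it, and the default pool should not be the bottleneck.
assert dag_concurrency <= parallelism
assert default_pool >= parallelism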
[operators]
# The default owner assigned to each new operator, unless
# provided explicitly or passed via `default_args`
default_owner = Airflow
default_cpus = 4
default_ram = 512
default_disk = 512
default_gpus = 0
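
A minimal sketch of how these defaults interact with explicit arguments (Airflow 1.x import paths assumed; the dag_id and commands are made up): an operator that sets no owner inherits default_owner from this section, while an explicit owner wins.

from datetime import datetime

from airflow import DAG
from airflow.operators.bash_operator import BashOperator

demo = DAG("owner_demo", start_date=datetime(2018, 1, 1), schedule_interval=None)

# no owner given: falls back to default_owner = Airflow from [operators]
t1 = BashOperator(task_id="uses_default_owner", bash_command="echo hi", dag=demo)

# explicit owner overrides the config default
t2 = BashOperator(task_id="explicit_owner", bash_command="echo hi",
                  owner="af-user", dag=demo)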
[webserver]
# Number of seconds the gunicorn webserver waits before timing out on a worker
web_server_worker_timeout = 120
# Number of workers to refresh at a time. When set to 0, worker refresh is
# disabled. When nonzero, airflow periodically refreshes webserver workers by
# bringing up new ones and killing old ones.
worker_refresh_batch_size = 1
# Number of seconds to wait before refreshing a batch of workers.
worker_refresh_interval = 30
# Number of workers to run the Gunicorn web server
workers = 4
# The worker class gunicorn should use. Choices include
# sync (default), eventlet, gevent
worker_class = sync
[celery]
# The app name that will be used by celery
celery_app_name = airflow.executors.celery_executor
# The concurrency that will be used when starting workers with the
# "airflow worker" command. This defines the number of task instances that
# a worker will take, so size up your workers based on the resources on
# your worker box and the nature of your tasks
#celeryd_concurrency = 16
celeryd_concurrency = 32
# The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally
# a sqlalchemy database. Refer to the Celery documentation for more
# information.
broker_url = amqp://MY_AIRFLOW_USER:MY_AIRFLOW_PW@airflowmaster/airflow_vhost
# The Celery result backend. Its value can match broker_url or point at
# the metadata database (sql_alchemy_conn).
celery_result_backend = db+mysql://MY_AIRFLOW_USER:MY_AIRFLOW_PW@airflowmaster/airflow
# Default queue that tasks get assigned to and that workers listen on.
default_queue = default
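
To show where default_queue comes in, a hedged sketch (Airflow 1.x imports; the "heavy" queue and task names are invented): a task with no explicit queue lands on the default queue above, while a task pinned elsewhere is only picked up by workers subscribed to that queue (e.g. started with airflow worker -q heavy).

from datetime import datetime

from airflow import DAG
from airflow.operators.bash_operator import BashOperator

dag = DAG("queue_demo", start_date=datetime(2018, 1, 1), schedule_interval=None)

# no queue set: assigned to default_queue ('default' above), so any plain
# `airflow worker` will pick it up
small = BashOperator(task_id="small_task", bash_command="echo hi", dag=dag)

# pinned to a dedicated queue: only workers listening on 'heavy' run this
big = BashOperator(task_id="big_memory_task", bash_command="echo crunch",
                   queue="heavy", dag=dag)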
[scheduler]
# Task instances listen for an external kill signal (when you clear tasks
# from the CLI or the UI); this defines the frequency at which they should
# listen (in seconds).
job_heartbeat_sec = 5
# The scheduler constantly tries to trigger new tasks (see the
# scheduler section in the docs for more information). This defines
# how often the scheduler should run (in seconds).
scheduler_heartbeat_sec = 1
# How long (in seconds) the scheduler should run before terminating;
# -1 indicates it should run continuously (see also num_runs)
#run_duration = -1
run_duration = 600
# Minimum number of seconds to wait between re-processing the same DAG file
min_file_process_interval = 0
# How often (in seconds) to scan the DAGs directory for new files
dag_dir_list_interval = 300
# How often (in seconds) stats should be printed to the logs
print_stats_interval = 30
# Local task jobs periodically heartbeat to the DB. If the job has
# not sent a heartbeat in this many seconds, the scheduler will mark the
# associated task instance as failed and will re-schedule the task.
scheduler_zombie_task_threshold = 300
# Turn off scheduler catchup by setting this to False.
# Command-line backfills still work, but the scheduler
# will not backfill missed DAG runs if this is False.
# This can also be set on a per-DAG basis in the
# DAG definition via the `catchup` argument.
catchup_by_default = True
# Statsd (https://github.com/etsy/statsd) integration settings
statsd_on = False
statsd_host = localhost
statsd_port = 8125
statsd_prefix = airflow
# The scheduler can run multiple threads in parallel to schedule dags.
# This defines how many threads will run. However, airflow will never
# use more threads than the number of CPU cores available.
max_threads = 64
authenticate = False
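
With catchup_by_default = True, a scheduled DAG backfills any runs it missed since its start_date; a minimal sketch of the per-DAG override mentioned in the comment above (the dag_id is hypothetical):

from datetime import datetime

from airflow import DAG

no_backfill = DAG(
    dag_id="no_backfill_demo",
    start_date=datetime(2018, 1, 1),
    schedule_interval="@daily",
    catchup=False,  # overrides catchup_by_default for this DAG only
)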
[mesos]
# Number of cpu cores required for running one task instance using
# 'airflow run <dag_id> <task_id> <execution_date> --local -p <pickle_id>'
# command on a mesos slave
task_cpu = 1
# Memory in MB required for running one task instance using
# 'airflow run <dag_id> <task_id> <execution_date> --local -p <pickle_id>'
# command on a mesos slave
task_memory = 256
# Enable framework checkpointing for mesos
# See http://mesos.apache.org/documentation/latest/slave-recovery/
checkpoint = False
# Failover timeout in seconds.
# When checkpointing is enabled and this option is set, Mesos waits
# until the configured timeout for the MesosExecutor framework to
# re-register after a failover. Mesos shuts down running tasks if the
# MesosExecutor framework fails to re-register within this timeframe.
# failover_timeout = 604800
[kerberos]
reinit_frequency = 3600
# Example DAG definition using the config above (imports added so the
# snippet runs standalone).
from datetime import datetime, timedelta

from airflow import DAG

DEF_ARGS = {
    'start_date': datetime.utcnow(),
    'owner': 'af-user',
    'depends_on_past': False,
    'email': ['yada@yada.yada'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(hours=24),
    'queue': 'default',  # matches default_queue in [celery]
    'pool': None,
    'priority_weight': 1,
}
this_dag = DAG(
    dag_id="proc_myd01_to_myd0_otis_l2_gom",
    default_args=DEF_ARGS,
    schedule_interval=None,  # no schedule: runs only when triggered manually
)