Skip to content

Instantly share code, notes, and snippets.

@manuthu
Last active July 4, 2017 05:48
Show Gist options
  • Save manuthu/3fae80b5b99d7aa57060706d4bd2230d to your computer and use it in GitHub Desktop.
Save manuthu/3fae80b5b99d7aa57060706d4bd2230d to your computer and use it in GitHub Desktop.
Deletes data from jobs already executed by the ETL. This worked before DSS [Macros](https://doc.dataiku.com/dss/latest/operations/macros.html) were available.
ubuntu@etl:/etc/cron.daily$ cat delete-etl-dss-data-job.py
#!/usr/bin/env python
import datetime
import glob
import logging
import logging.handlers
import shutil
import subprocess
# Syslog sink: records are written to the /dev/log socket, each one
# prefixed with the time the record was created.
formatter = logging.Formatter('%(asctime)s:-> %(message)s')
handler = logging.handlers.SysLogHandler(address='/dev/log')
handler.setFormatter(formatter)

# Module logger: emits DEBUG and above through the syslog handler.
log = logging.getLogger(__name__)
log.addHandler(handler)
log.setLevel(logging.DEBUG)
def delete_directories(directories):
    """Delete every path in *directories*, logging its size beforehand.

    Regular files and symbolic links are unlinked; anything else is removed
    recursively as a directory tree.

    Args:
        directories: Iterable of filesystem paths to remove.
    """
    import os  # local import: this revision's import block does not have ``os``

    for directory in directories:
        log.debug('Delete size:[%s] directory:[%s]', du(directory), directory)
        # shutil.rmtree() refuses plain files and symlinks, so unlink those
        # directly instead of letting one such entry abort the whole run.
        if os.path.islink(directory) or os.path.isfile(directory):
            os.remove(directory)
        else:
            shutil.rmtree(directory)
def du(path):
    """Return the disk usage of *path* as reported by ``du -sh``.

    Args:
        path: File or directory to measure.

    Returns:
        The first whitespace-separated field of the command output (the
        human-readable size), decoded from UTF-8.

    Raises:
        subprocess.CalledProcessError: If ``du`` exits with a non-zero status.
    """
    command = ['du', '-sh', path]
    return subprocess.check_output(command).split()[0].decode('utf8')
def today():
    """Return the current local date as a ``YYYY-MM-DD`` string."""
    return datetime.date.today().isoformat()
def get_delete_directories(path):
    """Return the paths matching *path* that do not carry today's date.

    Args:
        path: Glob pattern to expand.

    Returns:
        A list of matching paths whose text does not contain the string
        returned by ``today()``.
    """
    # Compute the date once so every entry is compared against the same day.
    current_day = today()
    return [entry for entry in glob.iglob(path) if current_day not in entry]
if __name__ == '__main__':
    # Pattern handed to get_delete_directories() to select removal candidates.
    PATH = '/home/dataiku/dss_data/jobs/USERPROFILEENRICHMENT/*'
    log.debug('ETL log delete started at %s', PATH)
    delete_directories(get_delete_directories(PATH))
@manuthu
Copy link
Author

manuthu commented Jun 5, 2017

ubuntu@etl:/etc/cron.daily$ cat delete-etl-dss-data-job.py
#!/usr/bin/env python
import datetime
import glob
import logging
import logging.handlers
import os
import shutil
import subprocess


# Handler that writes to the /dev/log socket; every record is rendered as
# "<creation time>:-> <message>".
formatter = logging.Formatter('%(asctime)s:-> %(message)s')
handler = logging.handlers.SysLogHandler(address='/dev/log')
handler.setFormatter(formatter)

# Logger for this module, passing DEBUG and above to the handler.
log = logging.getLogger(__name__)
log.addHandler(handler)
log.setLevel(logging.DEBUG)


def delete_directories(directories):
    """Delete every path in *directories*, logging its size beforehand.

    Regular files and symbolic links are unlinked; anything else is assumed
    to be a directory and is removed recursively.

    Args:
        directories: Iterable of filesystem paths to remove.
    """
    for directory in directories:
        log.debug('Delete size:[%s] directory:[%s]', du(directory), directory)
        # Unlink regular files and symlinks. Symlinks are checked explicitly
        # because isfile() is False for a link to a directory (or a broken
        # link), and shutil.rmtree() refuses to operate on a symlink.
        if os.path.islink(directory) or os.path.isfile(directory):
            os.remove(directory)
        # Anything else is assumed to be a real directory.
        else:
            shutil.rmtree(directory)


def du(path):
    """Return the size ``du -sh`` reports for *path*.

    The result is the first whitespace-separated field of the command's
    output, decoded from UTF-8.
    """
    raw_output = subprocess.check_output(['du', '-sh', path])
    size_field = raw_output.split()[0]
    return size_field.decode('utf8')


def today():
    """Return the current local date as ``YYYY-MM-DD`` (month and day zero-padded)."""
    now = datetime.datetime.today()
    return '{}-{:02d}-{:02d}'.format(now.year, now.month, now.day)


def yesterday(reference=None):
    """Return the day before *reference* as a ``YYYY-MM-DD`` string.

    Args:
        reference: Optional ``datetime.date`` or ``datetime.datetime`` to
            count back from. Defaults to the current local date.

    Returns:
        The date one day earlier, with month and day zero-padded.
    """
    if reference is None:
        reference = datetime.date.today()
    # Use real date arithmetic: decrementing the day number by hand produces
    # the non-existent day "00" on the first of a month.
    previous = reference - datetime.timedelta(days=1)
    return '{}-{:02d}-{:02d}'.format(previous.year, previous.month, previous.day)


def get_delete_directories(path):
    """Return the paths matching *path* that are not dated today or yesterday.

    Args:
        path: Glob pattern to expand.

    Returns:
        A list of matching paths whose text contains neither the string
        returned by ``today()`` nor the one returned by ``yesterday()``.
    """
    # Resolve both dates once so every entry is judged against the same
    # pair of days, rather than recomputing them per entry.
    protected_days = (today(), yesterday())
    return [
        entry for entry in glob.iglob(path)
        if not any(day in entry for day in protected_days)
    ]


if __name__ == '__main__':
    # Pattern handed to get_delete_directories() to select removal candidates.
    PATH = '/home/dataiku/dss_data/jobs/USERPROFILEENRICHMENT/*'
    log.debug('ETL log delete started at %s', PATH)
    stale_entries = get_delete_directories(PATH)
    delete_directories(stale_entries)

@manuthu
Copy link
Author

manuthu commented Jun 12, 2017

Moving the script to /usr/bin/dss and scheduling it from /etc/cron.d:

ubuntu@etl:/etc/cron.daily$ cat /etc/cron.d/delete-dss-log-data
#
# Regular cron jobs
# Deletes dss data logs from etl.
#

# */5 * * * * root /usr/bin/dss/delete-dss-log-data.py
0 18 * * * root /usr/bin/dss/delete-dss-log-data.py

@manuthu
Copy link
Author

manuthu commented Jul 4, 2017

#!/usr/bin/env python
import datetime
import glob
import logging
import logging.handlers
import os
import shutil
import subprocess


# Build the syslog handler first: it targets the /dev/log socket and stamps
# each record with its creation time ahead of the message.
formatter = logging.Formatter('%(asctime)s:-> %(message)s')
handler = logging.handlers.SysLogHandler(address='/dev/log')
handler.setFormatter(formatter)

# Then wire it to this module's logger, which lets DEBUG and above through.
log = logging.getLogger(__name__)
log.addHandler(handler)
log.setLevel(logging.DEBUG)


def delete_directories(directories):
    """Remove each path in *directories* after logging its size.

    Regular files and symbolic links are unlinked; anything else is assumed
    to be a directory and is removed recursively.

    Args:
        directories: Iterable of filesystem paths to remove.
    """
    for directory in directories:
        log.debug('Delete size:[%s] directory:[%s]', du(directory), directory)
        # Unlink regular files and symlinks. The explicit islink() check is
        # needed because isfile() is False for a link to a directory (or a
        # dangling link), and shutil.rmtree() refuses to operate on a symlink.
        if os.path.islink(directory) or os.path.isfile(directory):
            os.remove(directory)
        # Anything else is assumed to be a real directory.
        else:
            shutil.rmtree(directory)


def du(path):
    """Run ``du -sh`` on *path* and return the size column as text.

    Only the first whitespace-separated token of the output is kept, and it
    is decoded from UTF-8 before being returned.
    """
    output = subprocess.check_output(['du', '-sh', path])
    fields = output.split()
    return fields[0].decode('utf8')


def today():
    """Return the current local date in ISO ``YYYY-MM-DD`` form."""
    return datetime.date.today().isoformat()


def yesterday():
    """Return the previous local calendar day in ISO ``YYYY-MM-DD`` form."""
    one_day = datetime.timedelta(days=1)
    return (datetime.date.today() - one_day).isoformat()


def get_delete_directories(path):
    """Return the paths matching *path* that are not dated today or yesterday.

    Args:
        path: Glob pattern to expand.

    Returns:
        A list of matching paths whose text contains neither the string
        returned by ``today()`` nor the one returned by ``yesterday()``.
    """
    # Work out both dates a single time: they do not change per entry, and
    # this keeps the whole scan consistent if it runs across midnight.
    protected_days = (today(), yesterday())
    return [
        entry for entry in glob.iglob(path)
        if not any(day in entry for day in protected_days)
    ]


if __name__ == '__main__':
    # Pattern handed to get_delete_directories() to pick what gets removed.
    PATH = '/home/dataiku/dss_data/jobs/USERPROFILEENRICHMENT/*'
    log.debug('ETL log delete started at %s', PATH)
    removable = get_delete_directories(PATH)
    delete_directories(removable)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment