Last active
July 4, 2017 05:48
-
-
Save manuthu/3fae80b5b99d7aa57060706d4bd2230d to your computer and use it in GitHub Desktop.
Deletes data from jobs already executed by the ETL. This approach worked before DSS [Macros](https://doc.dataiku.com/dss/latest/operations/macros.html) were available.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ubuntu@etl:/etc/cron.daily$ cat delete-etl-dss-data-job.py | |
#!/usr/bin/env python | |
import datetime | |
import glob | |
import logging | |
import logging.handlers | |
import shutil | |
import subprocess | |
# Build the syslog handler first: records go to the local /dev/log socket,
# each message prefixed with a timestamp.
handler = logging.handlers.SysLogHandler(address='/dev/log')
formatter = logging.Formatter('%(asctime)s:-> %(message)s')
handler.setFormatter(formatter)

# Module-level logger; DEBUG so the per-path deletion messages are emitted.
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
log.addHandler(handler)
def delete_directories(directories):
    """Delete every path in ``directories``, logging its size first.

    The glob that feeds this function can match regular files and symbolic
    links as well as directories. ``shutil.rmtree`` refuses both, so anything
    that is not a real directory is removed with ``os.remove`` instead.

    :param directories: iterable of filesystem paths to delete.
    :raises OSError: if a path cannot be removed.
    :raises subprocess.CalledProcessError: if ``du`` fails for a path.
    """
    # Local import: this script's import block does not bring in ``os``.
    import os

    for path in directories:
        log.debug('Delete size:[%s] directory:[%s]', du(path), path)
        if os.path.isdir(path) and not os.path.islink(path):
            shutil.rmtree(path)
        else:
            # Files, symlinks (even those pointing at directories) and other
            # non-directory entries are unlinked rather than walked.
            os.remove(path)
def du(path):
    """Return the disk usage of ``path`` as reported by ``du -sh``.

    ``--`` ends option parsing so that a path beginning with ``-`` is
    measured instead of being parsed as a ``du`` option.

    :param path: file or directory to measure.
    :returns: the size column of the ``du`` output as text.
    :raises subprocess.CalledProcessError: if ``du`` exits with an error.
    """
    output = subprocess.check_output(['du', '-sh', '--', path])
    # du prints "<size><whitespace><path>"; keep only the size field.
    return output.split()[0].decode('utf8')
def today():
    """Return the current local date as a ``YYYY-MM-DD`` string."""
    # date.isoformat() already zero-pads month and day, so the manual
    # str()/zfill() concatenation is unnecessary.
    return datetime.date.today().isoformat()
def get_delete_directories(path):
    """Return the paths matching ``path`` that do not mention today's date.

    :param path: glob pattern selecting the candidate entries.
    :returns: list of matched paths whose path string does not contain
        the value returned by ``today()``.
    """
    # Resolve the date once so every candidate is compared against the same
    # day, even if the scan happens to run across midnight.
    current_day = today()
    return [entry for entry in glob.iglob(path) if current_day not in entry]
if __name__ == '__main__':
    # Glob selecting every entry under the USERPROFILEENRICHMENT jobs folder.
    PATH = '/home/dataiku/dss_data/jobs/USERPROFILEENRICHMENT/*'
    log.debug('ETL log delete started at %s', PATH)
    # Collect the candidates first, then remove them.
    stale_paths = get_delete_directories(PATH)
    delete_directories(stale_paths)
Author
manuthu
commented
Jun 5, 2017
•
Moving the script to /usr/bin
ubuntu@etl:/etc/cron.daily$ cat /etc/cron.d/delete-dss-log-data
#
# Regular cron jobs
# Deletes dss data logs from etl.
#
# */5 * * * * root /usr/bin/dss/delete-dss-log-data.py
0 18 * * * root /usr/bin/dss/delete-dss-log-data.py
#!/usr/bin/env python
import datetime
import glob
import logging
import logging.handlers
import os
import shutil
import subprocess
# Module-level logger; DEBUG so the per-path deletion messages are emitted.
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)

# Send records to the local syslog socket, each prefixed with a timestamp.
formatter = logging.Formatter('%(asctime)s:-> %(message)s')
handler = logging.handlers.SysLogHandler(address='/dev/log')
handler.setFormatter(formatter)
log.addHandler(handler)
def delete_directories(directories):
    """Remove each path in ``directories`` after logging its size.

    Only real directories are handed to ``shutil.rmtree``. Everything else
    (regular files, symbolic links, other non-directory entries) is removed
    with ``os.remove``, because ``rmtree`` raises on symlinks and on
    non-directories.

    :param directories: iterable of filesystem paths to delete.
    :raises OSError: if a path cannot be removed.
    :raises subprocess.CalledProcessError: if ``du`` fails for a path.
    """
    for target in directories:
        log.debug('Delete size:[%s] directory:[%s]', du(target), target)
        # A symlink to a directory reports isdir() == True but must be
        # unlinked, not walked, so it is excluded explicitly.
        is_real_directory = (
            os.path.isdir(target) and not os.path.islink(target)
        )
        if is_real_directory:
            shutil.rmtree(target)
        else:
            os.remove(target)
def du(path):
    """Return the summarized size of ``path`` as printed by ``du -sh``.

    The ``--`` argument marks the end of options, so a path whose name
    starts with ``-`` is treated as an operand rather than a flag.

    :param path: file or directory to measure.
    :returns: the first whitespace-separated field of the ``du`` output
        (the size), decoded to text.
    :raises subprocess.CalledProcessError: if ``du`` exits with an error.
    """
    command = ['du', '-sh', '--', path]
    size_field = subprocess.check_output(command).split()[0]
    return size_field.decode('utf8')
def today():
    """Return the current local date formatted as ``YYYY-MM-DD``."""
    return datetime.date.today().isoformat()
def yesterday():
    """Return the local date one day before today as ``YYYY-MM-DD``."""
    one_day = datetime.timedelta(days=1)
    return (datetime.date.today() - one_day).isoformat()
def get_delete_directories(path):
    """Return the paths matching ``path`` that are older than yesterday.

    An entry is kept (not returned) when its path string contains the
    value of ``today()`` or ``yesterday()``; every other match is returned
    for deletion.

    :param path: glob pattern selecting the candidate entries.
    :returns: list of matched paths that mention neither date.
    """
    # Compute both dates once so all entries are judged against the same
    # pair of days, instead of re-evaluating them for every match.
    recent_days = (today(), yesterday())
    return [
        entry for entry in glob.iglob(path)
        if not any(day in entry for day in recent_days)
    ]
if __name__ == '__main__':
    # Glob selecting every entry under the USERPROFILEENRICHMENT jobs folder.
    PATH = '/home/dataiku/dss_data/jobs/USERPROFILEENRICHMENT/*'
    log.debug('ETL log delete started at %s', PATH)
    # Select what to remove, then remove it.
    expired_entries = get_delete_directories(PATH)
    delete_directories(expired_entries)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment