@rfmcnally
Created March 8, 2018 00:18
Script for downloading all CSV or Excel files from a list of Import.io Extractors
""" Script for automatically downloading attachments from Import.io API """
import os
import sys
import csv
from datetime import datetime
import requests
EXTRACTORS_FILE = 'importio_extractors.txt'

def create_folder():
    """ Function for creating output folder from current time """
    current_time = datetime.now().strftime("%m-%d-%yT%H-%M")
    output_dir = '{0}_output'.format(current_time)
    os.makedirs(output_dir)
    return output_dir

def get_extractor_name(extractor_id):
    """ Function for getting name of a given extractor """
    url = 'https://store.import.io/extractor/{0}'.format(extractor_id)
    querystring = {'_apikey': os.environ['IMPORT_IO_API_KEY']}
    resp = requests.get(url, params=querystring)
    name = resp.json()['name']
    return name

def get_extractor_crawlruns(extractor_id):
    """ Function for getting list of crawlruns from a given extractor """
    url = 'https://store.import.io/crawlrun/_search'
    querystring = {'_sort': "_meta.creationTimestamp", '_page': 1, '_perpage': 1000,
                   'extractorId': extractor_id, '_apikey': os.environ['IMPORT_IO_API_KEY']}
    resp = requests.get(url, params=querystring)
    crawlruns = resp.json()['hits']['hits']
    return crawlruns
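
# A rough sketch of the crawlrun record shape this script relies on, inferred
# only from the fields accessed below (not official Import.io documentation):
#
#   {
#     "fields": {
#       "guid": "...",               # crawlrun ID, used for attachment downloads
#       "state": "FINISHED",         # only finished runs are downloaded
#       "stoppedAt": 1520468000000   # epoch milliseconds, used for filenames
#     }
#   }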

def generate_filename(extractor_name, crawlrun):
    """ Function for generating filename from extractor name and crawlrun time """
    stamp = datetime.fromtimestamp(int(crawlrun['fields']['stoppedAt']) / 1000).strftime("%m-%d-%yT%H-%M")
    filename = "{filename}_{timestamp}".format(filename=extractor_name, timestamp=stamp)
    return filename

def download_attachment(object_type, object_id, attachment):
    """ Function for downloading an attachment from the object store """
    # renamed the first parameter from 'type' to avoid shadowing the builtin
    url = 'https://store.import.io/{0}/{1}/_attachment/{2}'.format(object_type, object_id, attachment)
    querystring = {'_apikey': os.environ['IMPORT_IO_API_KEY']}
    resp = requests.get(url, params=querystring)
    return resp
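
# For example, a CSV attachment for a crawlrun resolves to a URL of the form
#
#   https://store.import.io/crawlrun/{crawlrun-guid}/_attachment/csv
#
# (pattern taken from the format string above; the GUID placeholder is illustrative)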

def download_csv(crawl_id, output_path):
    """ Function for downloading CSV attachment from crawlrun """
    resp = download_attachment('crawlrun', crawl_id, 'csv')
    csv_resp = resp.content.decode('utf-8-sig')
    csv_output = csv_resp.splitlines()
    reader = csv.reader(csv_output, delimiter=',')
    # newline='' stops the csv module from writing blank rows on Windows
    with open(output_path + '.csv', 'w', encoding='utf-8-sig', newline='') as output:
        writer = csv.writer(output, delimiter=',')
        for row in reader:
            writer.writerow(row)

def download_excel(crawl_id, output_path):
    """ Function for downloading Excel attachment from crawlrun """
    resp = download_attachment('crawlrun', crawl_id, 'xlsx')
    with open(output_path + '.xlsx', 'wb') as xlsx_output:
        xlsx_output.write(resp.content)

def main(text_file):
    """ Function for downloading files from a list of extractor IDs """
    with open(text_file) as extractors:
        extractor_ids = extractors.read().splitlines()
    # An optional first CLI argument selects the attachment type; default is CSV
    if len(sys.argv) > 1:
        file_type = sys.argv[1]
    else:
        file_type = 'csv'
    output_dir = create_folder()
    for extractor_id in extractor_ids:
        extractor_name = get_extractor_name(extractor_id)
        crawlruns = get_extractor_crawlruns(extractor_id)
        for crawlrun in crawlruns:
            crawl_id = crawlrun['fields']['guid']
            crawl_state = crawlrun['fields']['state']
            if crawl_state == 'FINISHED':
                crawl_filename = generate_filename(extractor_name, crawlrun)
                output_path = os.path.join(output_dir, crawl_filename)
                if file_type == 'csv':
                    download_csv(crawl_id, output_path)
                elif file_type == 'xlsx':
                    download_excel(crawl_id, output_path)
                else:
                    print('Incorrect type defined: ' + file_type)
                    continue  # don't report an unsupported type as saved
                print(crawl_filename + ' saved to ' + output_dir)
            else:
                print(crawl_id + ' attachment not downloaded.')
    print('All files saved.')

if __name__ == '__main__':
    main(EXTRACTORS_FILE)
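
# Example usage, assuming this file is saved as importio_download.py (the
# filename is illustrative) and an API key is exported in the environment:
#
#   export IMPORT_IO_API_KEY=<your key>
#   python importio_download.py xlsx    # or omit the argument to download CSVs
#
# importio_extractors.txt is expected to hold one extractor GUID per line.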