Created
March 8, 2018 00:18
-
-
Save rfmcnally/a0f5326d05ca993da659b4e4a04400c3 to your computer and use it in GitHub Desktop.
Script for downloading all CSV or Excel files from a list of Import.io Extractors
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Script for automatically downloading attachments from Import.io API """ | |
import os | |
import sys | |
import csv | |
from datetime import datetime | |
import requests | |
# Input file read by main(): one Import.io extractor ID per line.
EXTRACTORS_FILE = 'importio_extractors.txt'
def create_folder():
    """Create a fresh output directory named after the current time.

    Returns:
        str: the directory name, e.g. '03-08-18T00-18_output'.
    """
    stamp = datetime.now().strftime("%m-%d-%yT%H-%M")
    folder = '{0}_output'.format(stamp)
    os.makedirs(folder)
    return folder
def get_extractor_name(extractor_id):
    """Fetch the display name of an Import.io extractor by its ID.

    Reads the API key from the IMPORT_IO_API_KEY environment variable.
    """
    endpoint = 'https://store.import.io/extractor/{0}'.format(extractor_id)
    params = {'_apikey': os.environ['IMPORT_IO_API_KEY']}
    response = requests.get(endpoint, params=params)
    return response.json()['name']
def get_extractor_crawlruns(extractor_id):
    """Return up to 1000 crawl-run records for the given extractor.

    Results come back sorted by creation timestamp; each element is one
    search hit from the Import.io store API.
    """
    endpoint = 'https://store.import.io/crawlrun/_search'
    params = {
        '_sort': "_meta.creationTimestamp",
        '_page': 1,
        '_perpage': 1000,
        'extractorId': extractor_id,
        '_apikey': os.environ['IMPORT_IO_API_KEY'],
    }
    response = requests.get(endpoint, params=params)
    return response.json()['hits']['hits']
def generate_filename(extractor_name, crawlrun):
    """Build the output filename '<extractor_name>_<MM-DD-YYThh-mm>'.

    The timestamp is the crawl run's 'stoppedAt' field (epoch milliseconds),
    rendered in local time.

    Args:
        extractor_name: name of the extractor the run belongs to.
        crawlrun: crawl-run search hit; reads crawlrun['fields']['stoppedAt'].

    Returns:
        str: filename stem (no extension).
    """
    stopped_ms = int(crawlrun['fields']['stoppedAt'])
    stamp = datetime.fromtimestamp(stopped_ms / 1000).strftime("%m-%d-%yT%H-%M")
    # Bug fix: the template previously never interpolated the extractor name,
    # so every run from the same minute produced the same (colliding) filename.
    return "{filename}_{timestamp}".format(filename=extractor_name, timestamp=stamp)
def download_attachment(type, object_id, attachment):
    """GET an attachment from the Import.io object store.

    Args:
        type: store object kind, e.g. 'crawlrun'.
        object_id: GUID of the object.
        attachment: attachment name, e.g. 'csv' or 'xlsx'.

    Returns:
        requests.Response: the raw HTTP response.
    """
    endpoint = 'https://store.import.io/{0}/{1}/_attachment/{2}'.format(
        type, object_id, attachment)
    params = {'_apikey': os.environ['IMPORT_IO_API_KEY']}
    return requests.get(endpoint, params=params)
def download_csv(crawl_id, output_path):
    """Download a crawl run's CSV attachment and save it as output_path + '.csv'.

    The response body is decoded as UTF-8 with BOM handling, round-tripped
    through the csv module, and re-written with a BOM so spreadsheet tools
    open it correctly.

    Args:
        crawl_id: GUID of the crawl run.
        output_path: destination path without extension.
    """
    resp = download_attachment('crawlrun', crawl_id, 'csv')
    csv_resp = resp.content.decode('utf-8-sig')
    reader = csv.reader(csv_resp.splitlines(), delimiter=',')
    # Bug fix: newline='' is required when handing a file to csv.writer;
    # without it, Windows gets doubled line endings (blank rows in Excel).
    with open(output_path + '.csv', 'w', encoding='utf-8-sig', newline='') as output:
        writer = csv.writer(output, delimiter=',')
        writer.writerows(reader)
def download_excel(crawl_id, output_path):
    """Download a crawl run's Excel attachment and save it as output_path + '.xlsx'.

    Args:
        crawl_id: GUID of the crawl run.
        output_path: destination path without extension.
    """
    resp = download_attachment('crawlrun', crawl_id, 'xlsx')
    # Fix: use a context manager so the handle is closed even if the write
    # raises (the original open/write/close leaked it on failure).
    with open(output_path + '.xlsx', 'wb') as xlsx_output:
        xlsx_output.write(resp.content)
def main(text_file):
    """Download attachments for every extractor ID listed in text_file.

    Reads one extractor ID per line, creates a timestamped output folder,
    and saves one file per FINISHED crawl run. The optional first CLI
    argument selects the format: 'csv' (default) or 'xlsx'.

    Args:
        text_file: path to the file of extractor IDs.
    """
    # Fix: close the ID-list file deterministically (the original leaked it).
    with open(text_file) as handle:
        extractor_ids = handle.read().splitlines()
    # Renamed from 'type' to avoid shadowing the builtin; behavior unchanged.
    file_type = sys.argv[1] if len(sys.argv) > 1 else 'csv'
    output_dir = create_folder()
    for extractor_id in extractor_ids:
        extractor_name = get_extractor_name(extractor_id)
        for crawlrun in get_extractor_crawlruns(extractor_id):
            crawl_id = crawlrun['fields']['guid']
            if crawlrun['fields']['state'] == 'FINISHED':
                crawl_filename = generate_filename(extractor_name, crawlrun)
                output_path = os.path.join(output_dir, crawl_filename)
                if file_type == 'csv':
                    download_csv(crawl_id, output_path)
                elif file_type == 'xlsx':
                    download_excel(crawl_id, output_path)
                else:
                    print('Incorrect type defined: ' + file_type)
                print(crawl_filename + ' saved to ' + output_dir)
            else:
                # Only FINISHED runs have complete attachments.
                print(crawl_id + ' attachment not downloaded.')
    print('All files saved.')
# Script entry point: process every extractor listed in EXTRACTORS_FILE.
if __name__ == '__main__':
    main(EXTRACTORS_FILE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment