@rfmcnally
Created March 8, 2018 00:18
Script for downloading all CSV or Excel files from a list of Import.io Extractors
""" Script for automatically downloading attachments from Import.io API """
import os
import sys
import csv
from datetime import datetime
import requests
EXTRACTORS_FILE = 'importio_extractors.txt'

def create_folder():
    """ Function for creating output folder from current time """
    current_time = datetime.now().strftime("%m-%d-%yT%H-%M")
    output_dir = '{0}_output'.format(current_time)
    os.makedirs(output_dir)
    return output_dir

def get_extractor_name(extractor_id):
    """ Function for getting name of a given extractor """
    url = 'https://store.import.io/extractor/{0}'.format(extractor_id)
    querystring = {'_apikey': os.environ['IMPORT_IO_API_KEY']}
    resp = requests.get(url, params=querystring)
    name = resp.json()['name']
    return name

def get_extractor_crawlruns(extractor_id):
    """ Function for getting list of crawlruns from a given extractor """
    url = 'https://store.import.io/crawlrun/_search'
    querystring = {'_sort': "_meta.creationTimestamp", '_page': 1, '_perpage': 1000,
                   'extractorId': extractor_id, '_apikey': os.environ['IMPORT_IO_API_KEY']}
    resp = requests.get(url, params=querystring)
    crawlruns = resp.json()['hits']['hits']
    return crawlruns
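
# A rough sketch of the crawlrun record shape this script relies on, inferred
# only from the fields accessed below (not official Import.io documentation):
#
#   {
#     "fields": {
#       "guid": "...",               # crawlrun ID, used for attachment downloads
#       "state": "FINISHED",         # only finished runs are downloaded
#       "stoppedAt": 1520468000000   # epoch milliseconds, used for filenames
#     }
#   }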

def generate_filename(extractor_name, crawlrun):
    """ Function for generating filename from extractor name and crawlrun time """
    stamp = datetime.fromtimestamp(int(crawlrun['fields']['stoppedAt']) / 1000).strftime("%m-%d-%yT%H-%M")
    filename = "{filename}_{timestamp}".format(filename=extractor_name, timestamp=stamp)
    return filename

def download_attachment(object_type, object_id, attachment):
    """ Function for downloading an attachment from the object store """
    # renamed the first parameter from 'type' to avoid shadowing the builtin
    url = 'https://store.import.io/{0}/{1}/_attachment/{2}'.format(object_type, object_id, attachment)
    querystring = {'_apikey': os.environ['IMPORT_IO_API_KEY']}
    resp = requests.get(url, params=querystring)
    return resp
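
# For example, a CSV attachment for a crawlrun resolves to a URL of the form
#
#   https://store.import.io/crawlrun/{crawlrun-guid}/_attachment/csv
#
# (pattern taken from the format string above; the GUID placeholder is illustrative)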

def download_csv(crawl_id, output_path):
    """ Function for downloading CSV attachment from crawlrun """
    resp = download_attachment('crawlrun', crawl_id, 'csv')
    csv_resp = resp.content.decode('utf-8-sig')
    csv_output = csv_resp.splitlines()
    reader = csv.reader(csv_output, delimiter=',')
    # newline='' stops the csv module from writing blank rows on Windows
    with open(output_path + '.csv', 'w', encoding='utf-8-sig', newline='') as output:
        writer = csv.writer(output, delimiter=',')
        for row in reader:
            writer.writerow(row)

def download_excel(crawl_id, output_path):
    """ Function for downloading Excel attachment from crawlrun """
    resp = download_attachment('crawlrun', crawl_id, 'xlsx')
    with open(output_path + '.xlsx', 'wb') as xlsx_output:
        xlsx_output.write(resp.content)

def main(text_file):
    """ Function for downloading files from a list of extractor IDs """
    with open(text_file) as extractors:
        extractor_ids = extractors.read().splitlines()
    # An optional first CLI argument selects the attachment type; default is CSV
    if len(sys.argv) > 1:
        file_type = sys.argv[1]
    else:
        file_type = 'csv'
    output_dir = create_folder()
    for extractor_id in extractor_ids:
        extractor_name = get_extractor_name(extractor_id)
        crawlruns = get_extractor_crawlruns(extractor_id)
        for crawlrun in crawlruns:
            crawl_id = crawlrun['fields']['guid']
            crawl_state = crawlrun['fields']['state']
            if crawl_state == 'FINISHED':
                crawl_filename = generate_filename(extractor_name, crawlrun)
                output_path = os.path.join(output_dir, crawl_filename)
                if file_type == 'csv':
                    download_csv(crawl_id, output_path)
                elif file_type == 'xlsx':
                    download_excel(crawl_id, output_path)
                else:
                    print('Incorrect type defined: ' + file_type)
                    continue  # don't report an unsupported type as saved
                print(crawl_filename + ' saved to ' + output_dir)
            else:
                print(crawl_id + ' attachment not downloaded.')
    print('All files saved.')

if __name__ == '__main__':
    main(EXTRACTORS_FILE)
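
# Example usage, assuming this file is saved as importio_download.py (the
# filename is illustrative) and an API key is exported in the environment:
#
#   export IMPORT_IO_API_KEY=<your key>
#   python importio_download.py xlsx    # or omit the argument to download CSVs
#
# importio_extractors.txt is expected to hold one extractor GUID per line.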