Skip to content

Instantly share code, notes, and snippets.

@LouiseMcMahon
Created September 22, 2016 17:54
Show Gist options
  • Save LouiseMcMahon/7ffb27c7d9b4244f87fa2821b593588c to your computer and use it in GitHub Desktop.
Save LouiseMcMahon/7ffb27c7d9b4244f87fa2821b593588c to your computer and use it in GitHub Desktop.
Script to run a Scrapy crawl, upload the output and logs to S3, and email a link to the results
#!/usr/bin/env python
import datetime
import os
import subprocess
import time

import boto3
# Pause for five minutes before doing anything else.
startupDelaySeconds = 5 * 60
time.sleep(startupDelaySeconds)
# Build the output file names from a single DDMMYY date stamp.
# The date is read once so the CSV and the log always carry the same
# stamp, even if the script happens to start right around midnight.
dateStamp = datetime.date.today().strftime("%d%m%y")
itemFileName = 'Crawl-Results-' + dateStamp + '.csv'
logFileName = 'Crawl-Results-' + dateStamp + '.log'
# Run the crawl, writing scraped items to itemFileName and the crawl log
# to logFileName. The command is given as an argument list, so no shell is
# involved and the file names reach scrapy exactly as written.
# The exit status is not checked here.
subprocess.call([
    'scrapy', 'crawl', 'spiderName',
    '-o', itemFileName,
    '--logfile', logFileName,
])
# Upload the item file and the log file to S3.
s3Client = boto3.client('s3', region_name='eu-west-1')
bucketName = 'bucketName'
# URL of the uploaded item CSV; it is used as the body of the email.
fileURL = 'https://s3-eu-west-1.amazonaws.com/' + bucketName + '/item-csv/' + itemFileName
# Each local file is stored under its own key prefix, item file first.
for keyPrefix, localName in (('item-csv/', itemFileName), ('crawler-logs/', logFileName)):
    with open(localName, 'rb') as fileHandle:
        s3Client.put_object(
            Bucket=bucketName,
            Key=keyPrefix + localName,
            Body=fileHandle,
            ContentType='text/plain',
            StorageClass='REDUCED_REDUNDANCY',
        )
# Send an email through SES whose body is the URL of the uploaded item file.
emailMessage = {
    'Subject': {'Data': 'Crawl Output'},
    'Body': {'Text': {'Data': fileURL}},
}
sesClient = boto3.client('ses', region_name='eu-west-1')
sesClient.send_email(
    Source='norepply@email.co.uk',
    Destination={'ToAddresses': ['email@email.co.uk']},
    Message=emailMessage,
    ReplyToAddresses=['email@email.co.uk'],
)
# Delete the local item file and log file.
for localFile in (itemFileName, logFileName):
    os.remove(localFile)
# Shut down the instance.
# The command is given as an argument list (no shell) with the option
# before the time argument, which is the order shutdown(8) documents.
subprocess.call(['shutdown', '-h', 'now'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment