Skip to content

Instantly share code, notes, and snippets.

@lungati
Forked from ninenine/download_paper.py
Last active January 11, 2016 09:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lungati/c6ace874232371bcdf01 to your computer and use it in GitHub Desktop.
Save lungati/c6ace874232371bcdf01 to your computer and use it in GitHub Desktop.
Download Kenyan Daily Nation and Business Daily
#!/usr/bin/env python
# coding: utf-8
import requests
from datetime import date, timedelta
from os.path import isfile, expanduser, exists, join
from os import makedirs
def getDateSuffix(t):
    """Return the English ordinal suffix ('st', 'nd', 'rd' or 'th') for t.day."""
    last_digit = t.day % 10
    # The teens (4th-20th) and 24th-30th always read 'th'; every other
    # day of the month takes its suffix from the final digit.
    if not (4 <= t.day <= 20 or 24 <= t.day <= 30):
        return {1: "st", 2: "nd", 3: "rd"}[last_digit]
    return "th"
def setFileDetails(today):
    """Download the Daily Nation and Business Daily PDFs dated *today*,
    then recurse one day back until no more issues are available.

    today -- a datetime.date for the issue to fetch.

    Side effects: creates ~/Downloads/newspapers if needed and writes the
    two PDFs there via download().
    """
    print("Downloading for day: %s" % today)
    suffix = getDateSuffix(today)
    fdate = "%s %s%s %s.pdf" % (today.strftime('%b'),
                                today.strftime('%d').lstrip('0'),
                                suffix,
                                today.strftime('%Y'))
    # Web urls (requests URL-encodes the embedded spaces for us).
    downloadfile1 = join("http://downloads.realviewtechnologies.com/Nation Media/Daily Nation", fdate)
    downloadfile2 = join("http://downloads.realviewtechnologies.com/Nation Media/Business Daily", fdate)
    # Local destination folder and file names (spaces stripped).
    dl_folder = expanduser('~/Downloads/newspapers')
    if not exists(dl_folder):
        makedirs(dl_folder)
    nation_file = join(dl_folder, 'DailyNation%s' % fdate).replace(" ", "")
    bdaily_file = join(dl_folder, 'Bdaily%s' % fdate).replace(" ", "")
    # Already downloaded: skip straight to the previous day.
    # BUGFIX: the original fell through here and re-downloaded the same
    # issue after the recursive call returned; we must stop now.
    if isfile(nation_file):
        print("File already downloaded. %s" % nation_file)
        setFileDetails(today - timedelta(days=1))
        return
    # Check the headers for content length.
    # BUGFIX: the header value is a string (and may be absent for an
    # unpublished issue) -- convert to int and use a missing/zero length
    # to terminate the recursion instead of crashing with a KeyError.
    hdr1 = requests.head(downloadfile1).headers
    hdr2 = requests.head(downloadfile2).headers
    url1_length = int(hdr1.get('content-length', 0))
    url2_length = int(hdr2.get('content-length', 0))
    if url1_length <= 0 and url2_length <= 0:
        print("No more issues available; stopping at %s" % today)
        return
    # stream=True defers the body download until download() reads it.
    req_nation = requests.get(downloadfile1, stream=True)
    req_bdaily = requests.get(downloadfile2, stream=True)
    download("Daily Nation", url1_length, nation_file, req_nation)
    download("Business Daily", url2_length, bdaily_file, req_bdaily)
    # Keep checking for older newspapers.
    # BUGFIX: the original recursed with the same date and looped forever;
    # go back one day as the comment intended.
    setFileDetails(today - timedelta(days=1))
def download(str_file_dl, content_length, file_url, request_instance):
    """Stream an HTTP response body to disk in 8 KiB chunks.

    str_file_dl      -- human-readable name used in progress messages.
    content_length   -- expected size in bytes (int or numeric string);
                        used for the percentage display and the skip check.
    file_url         -- local path to write the PDF to.
    request_instance -- a response object exposing .raw.read(n)
                        (e.g. requests.get(..., stream=True)).
    """
    import sys  # local import so the module's import block is untouched
    try:
        print("Start downloading " + str_file_dl)
        # BUGFIX: HTTP headers deliver content-length as a string; the
        # original compared str > int, which is always true on Python 2
        # and a TypeError on Python 3.
        total = int(content_length)
        if total > 0:
            file_sz = 0
            # BUGFIX: 'with' guarantees the file is closed even when the
            # network read raises mid-download (the original leaked the
            # handle on any exception).
            with open(file_url, 'wb') as f:
                print("Downloading: %s bytes: %s" % (file_url, total))
                while True:
                    chunk = request_instance.raw.read(8192)
                    if not chunk:
                        break
                    file_sz += len(chunk)
                    f.write(chunk)
                    # Rewrite the progress line in place using backspaces.
                    status = r"%10d [%3.2f%%]" % (file_sz, file_sz * 100. / float(total))
                    sys.stdout.write(status + chr(8) * (len(status) + 1))
        print(str_file_dl + " Complete")
    except Exception as err:
        # Best-effort: report the failure and carry on with the next paper.
        print("Error Downloading " + str_file_dl, err)
def scrapContent():
    """Placeholder: scan downloaded issues for featured sections.

    Intended to look for the keywords 'Beautiful Kenya' in the Daily
    Nation and 'BD Life: Travel Special' in the Business Daily.
    TODO: the Accept-Ranges header means downloads could be resumed!!
    """
    pass
if __name__ == '__main__':
    # Entry point: start with today's issue and walk backwards in time.
    # BUGFIX: the parenthesized print form runs on both Python 2 and 3;
    # the original bare print statement is a SyntaxError on Python 3.
    print("Start downloads")
    setFileDetails(date.today())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment