-
-
Save lungati/c6ace874232371bcdf01 to your computer and use it in GitHub Desktop.
Download Kenyan Daily Nation and Business Daily
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# coding: utf-8 | |
import sys
from datetime import date, timedelta
from os import makedirs
from os.path import isfile, expanduser, exists, join

import requests
def getDateSuffix(t): | |
if 4 <= t.day <= 20 or 24 <= t.day <= 30: | |
return "th" | |
else: | |
return ["st", "nd", "rd"][t.day % 10 - 1] | |
def setFileDetails(today): | |
print "Downloading for day: %s" % (today) | |
suffix = getDateSuffix(today) | |
fdate = "%s %s%s %s.pdf" % (today.strftime('%b'),today.strftime('%d').lstrip('0'),suffix,today.strftime('%Y')) | |
#Web urls | |
downloadfile1 = join("http://downloads.realviewtechnologies.com/Nation Media/Daily Nation",fdate) | |
downloadfile2 = join("http://downloads.realviewtechnologies.com/Nation Media/Business Daily",fdate) | |
#File urls | |
dl_folder = expanduser('~/Downloads/newspapers') | |
if not exists(dl_folder): | |
makedirs(dl_folder) | |
nation_file = join(dl_folder,'DailyNation%s' % fdate).replace(" ","") | |
bdaily_file = join(dl_folder,'Bdaily%s' % fdate).replace(" ","") | |
'''Checks whether the file exists''' | |
if(isfile(nation_file)): | |
print "File already downloaded. %s" % nation_file | |
setFileDetails(today - timedelta(days=1)) | |
#Check the headers for content length | |
#N.B: Requests auto-html-encodes our URLS for the spaces :) | |
hdr1 = requests.head(downloadfile1).headers | |
hdr2 = requests.head(downloadfile2).headers | |
url1_length = hdr1['content-length'] | |
url2_length = hdr2['content-length'] | |
#Stream = True doesn't begin the download till you call its methods | |
req_nation = requests.get(downloadfile1, stream = True) | |
req_bdaily = requests.get(downloadfile2, stream = True) | |
#Download Nation | |
download("Daily Nation", url1_length, nation_file, req_nation) | |
#Download Bdaily | |
download("Business Daily", url2_length, bdaily_file, req_bdaily) | |
#Keep checking for older newspapers, go back a day till there's no more content | |
setFileDetails(today) | |
def download(str_file_dl, content_length, file_url, request_instance): | |
#Best way to download files is to write chunks to file | |
try: | |
print "Start downloading "+str_file_dl | |
if content_length > 0: | |
file_sz = 0 | |
f = open(file_url, 'wb') | |
print "Downloading: %s bytes: %s" % (file_url, content_length) | |
while True: | |
buffer = request_instance.raw.read(8192) | |
if not buffer: | |
break | |
file_sz += len(buffer) | |
f.write(buffer) | |
status = r"%10d [%3.2f%%]" % (file_sz, file_sz * 100. / float(content_length)) | |
status = status + chr(8)*(len(status)+1) | |
print status, | |
f.close() | |
print(str_file_dl+" Complete") | |
except Exception, err: | |
print("Error Downloading "+str_file_dl, err) | |
def scrapContent(): | |
#Looking for keywords 'Beautiful Kenya' in Daily Nation and in Business Daily 'BD Life: Travel Special' | |
#TODO Header accept-ranges allows downloads to be resumed!! | |
pass | |
if __name__ == '__main__': | |
print "Start downloads" | |
setFileDetails(date.today()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment