-
-
Save lungati/c6ace874232371bcdf01 to your computer and use it in GitHub Desktop.
Download Kenyan Daily Nation and Business Daily
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# coding: utf-8 | |
import sys
from datetime import date, timedelta
from os import makedirs
from os.path import isfile, expanduser, exists, join

import requests
def getDateSuffix(t): | |
if 4 <= t.day <= 20 or 24 <= t.day <= 30: | |
return "th" | |
else: | |
return ["st", "nd", "rd"][t.day % 10 - 1] | |
def setFileDetails(today): | |
print "Downloading for day: %s" % (today) | |
suffix = getDateSuffix(today) | |
fdate = "%s %s%s %s.pdf" % (today.strftime('%b'),today.strftime('%d').lstrip('0'),suffix,today.strftime('%Y')) | |
#Web urls | |
downloadfile1 = join("http://downloads.realviewtechnologies.com/Nation Media/Daily Nation",fdate) | |
downloadfile2 = join("http://downloads.realviewtechnologies.com/Nation Media/Business Daily",fdate) | |
#File urls | |
dl_folder = expanduser('~/Downloads/newspapers') | |
if not exists(dl_folder): | |
makedirs(dl_folder) | |
nation_file = join(dl_folder,'DailyNation%s' % fdate).replace(" ","") | |
bdaily_file = join(dl_folder,'Bdaily%s' % fdate).replace(" ","") | |
'''Checks whether the file exists''' | |
if(isfile(nation_file)): | |
print "File already downloaded. %s" % nation_file | |
setFileDetails(today - timedelta(days=1)) | |
#Check the headers for content length | |
#N.B: Requests auto-html-encodes our URLS for the spaces :) | |
hdr1 = requests.head(downloadfile1).headers | |
hdr2 = requests.head(downloadfile2).headers | |
url1_length = hdr1['content-length'] | |
url2_length = hdr2['content-length'] | |
#Stream = True doesn't begin the download till you call its methods | |
req_nation = requests.get(downloadfile1, stream = True) | |
req_bdaily = requests.get(downloadfile2, stream = True) | |
#Download Nation | |
download("Daily Nation", url1_length, nation_file, req_nation) | |
#Download Bdaily | |
download("Business Daily", url2_length, bdaily_file, req_bdaily) | |
#Keep checking for older newspapers, go back a day till there's no more content | |
setFileDetails(today) | |
def download(str_file_dl, content_length, file_url, request_instance): | |
#Best way to download files is to write chunks to file | |
try: | |
print "Start downloading "+str_file_dl | |
if content_length > 0: | |
file_sz = 0 | |
f = open(file_url, 'wb') | |
print "Downloading: %s bytes: %s" % (file_url, content_length) | |
while True: | |
buffer = request_instance.raw.read(8192) | |
if not buffer: | |
break | |
file_sz += len(buffer) | |
f.write(buffer) | |
status = r"%10d [%3.2f%%]" % (file_sz, file_sz * 100. / float(content_length)) | |
status = status + chr(8)*(len(status)+1) | |
print status, | |
f.close() | |
print(str_file_dl+" Complete") | |
except Exception, err: | |
print("Error Downloading "+str_file_dl, err) | |
def scrapContent(): | |
#Looking for keywords 'Beautiful Kenya' in Daily Nation and in Business Daily 'BD Life: Travel Special' | |
#TODO Header accept-ranges allows downloads to be resumed!! | |
pass | |
if __name__ == '__main__': | |
print "Start downloads" | |
setFileDetails(date.today()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment