# IngoKl/GuardianCommentDownloader.py

## GuardianCommentDownloader.py
import urllib2
import re
import time
import sys

# This script will download all comments from a Guardian (theguardian.com) article into a single .html file
# Use 'File ID' to easily identify your file later (this could also be something like a Citavi identifier)
# Ingo Kleiber <ikleiber@googlemail.com>

# Show the banner. The backslash right after the opening quotes joins the
# first line, so the output does not start with an empty line. The literal is
# deliberately not a raw string for that reason.
print("""\
   _____                     _ _             _____  _
  / ____|                   | (_)           |  __ \| |
 | |  __ _   _  __ _ _ __ __| |_  __ _ _ __ | |  | | |
 | | |_ | | | |/ _` | '__/ _` | |/ _` | '_ \| |  | | |
 | |__| | |_| | (_| | | | (_| | | (_| | | | | |__| | |____
  \_____|\__,_|\__,_|_|  \__,_|_|\__,_|_| |_|_____/|______|

""")

print('Guardian Comment Downloader v.1.1 - Ingo Kleiber (05.01.2015)\n')
guardianLink = raw_input('> Guardian Link: ')
fileId = raw_input('> File ID: ')

# Figure out the pageId: it is whatever follows the first '/p/' in the
# article's HTML, up to the next double quote.
pageResponse = urllib2.urlopen(guardianLink)
try:
	articleHtml = pageResponse.read()
finally:
	# Release the connection even if reading fails.
	pageResponse.close()

commentsMatch = re.search(r'/p/(.*?)"', articleHtml, re.M | re.I)
if not commentsMatch:
	sys.exit('[-] Could not retrieve pageId!')
pageId = commentsMatch.group(1)
print('[+] pageId has been retrieved (' + pageId + ')')

# Retrieve the comment pages and append them, one after another, to a single
# file named after the pageId, the start timestamp and the user's File ID.
downloadCount = 1

startTime = time.time()
fileName = 'GuardianDownload_' + pageId + '_' + str(startTime) + '_' + fileId + '.html'

print('[~] Start downloading pages ...')
# 'with' guarantees the output file is flushed and closed however the loop ends.
with open(fileName, 'a') as f:
	# There is no page count to go by: keep requesting the next page until a
	# request fails (presumably an HTTP error once past the last page).
	while True:
		try:
			response = urllib2.urlopen('http://www.theguardian.com/discussion/p/' + pageId + '?page=' + str(downloadCount))
			try:
				html = response.read()
			finally:
				response.close()
			f.write(html)
		except Exception as error:
			# Any failure still ends the download (best effort, as before),
			# but Ctrl-C / SystemExit now propagate and the reason is shown
			# instead of being silently discarded.
			print('[-] Stopped at page ' + str(downloadCount) + ': ' + str(error))
			break
		print('[+] Page: ' + str(downloadCount) + ' has been downloaded!')
		downloadCount = downloadCount + 1

stopTime = time.time()
runningTime = (stopTime - startTime) / 60
# downloadCount points at the page that failed, hence the "- 1".
print('[=] ' + str(downloadCount - 1) + ' pages have been downloaded in ' + str(round(runningTime, 2)) + ' minutes')
	import urllib2
	import re
	import time
	import sys

	# This script will download all comments from a Guardian (theguardian.com) article into a single .html file
	# Use 'File ID' to easily identify your file later (this could also be something like a Citavi identifier)
	# Ingo Kleiber <ikleiber@googlemail.com>

	# Show the banner. The backslash right after the opening quotes joins the
	# first line, so the output does not start with an empty line. The literal is
	# deliberately not a raw string for that reason.
	print("""\
	   _____                     _ _             _____  _
	  / ____|                   | (_)           |  __ \| |
	 | |  __ _   _  __ _ _ __ __| |_  __ _ _ __ | |  | | |
	 | | |_ | | | |/ _` | '__/ _` | |/ _` | '_ \| |  | | |
	 | |__| | |_| | (_| | | | (_| | | (_| | | | | |__| | |____
	  \_____|\__,_|\__,_|_|  \__,_|_|\__,_|_| |_|_____/|______|

	""")

	print('Guardian Comment Downloader v.1.1 - Ingo Kleiber (05.01.2015)\n')
	guardianLink = raw_input('> Guardian Link: ')
	fileId = raw_input('> File ID: ')

	# Figure out the pageId: it is whatever follows the first '/p/' in the
	# article's HTML, up to the next double quote.
	pageResponse = urllib2.urlopen(guardianLink)
	try:
		articleHtml = pageResponse.read()
	finally:
		# Release the connection even if reading fails.
		pageResponse.close()

	commentsMatch = re.search(r'/p/(.*?)"', articleHtml, re.M | re.I)
	if not commentsMatch:
		sys.exit('[-] Could not retrieve pageId!')
	pageId = commentsMatch.group(1)
	print('[+] pageId has been retrieved (' + pageId + ')')

	# Retrieve the comment pages and append them, one after another, to a single
	# file named after the pageId, the start timestamp and the user's File ID.
	downloadCount = 1

	startTime = time.time()
	fileName = 'GuardianDownload_' + pageId + '_' + str(startTime) + '_' + fileId + '.html'

	print('[~] Start downloading pages ...')
	# 'with' guarantees the output file is flushed and closed however the loop ends.
	with open(fileName, 'a') as f:
		# There is no page count to go by: keep requesting the next page until a
		# request fails (presumably an HTTP error once past the last page).
		while True:
			try:
				response = urllib2.urlopen('http://www.theguardian.com/discussion/p/' + pageId + '?page=' + str(downloadCount))
				try:
					html = response.read()
				finally:
					response.close()
				f.write(html)
			except Exception as error:
				# Any failure still ends the download (best effort, as before),
				# but Ctrl-C / SystemExit now propagate and the reason is shown
				# instead of being silently discarded.
				print('[-] Stopped at page ' + str(downloadCount) + ': ' + str(error))
				break
			print('[+] Page: ' + str(downloadCount) + ' has been downloaded!')
			downloadCount = downloadCount + 1

	stopTime = time.time()
	runningTime = (stopTime - startTime) / 60
	# downloadCount points at the page that failed, hence the "- 1".
	print('[=] ' + str(downloadCount - 1) + ' pages have been downloaded in ' + str(round(runningTime, 2)) + ' minutes')