Skip to content

Instantly share code, notes, and snippets.

@IngoKl
Last active January 5, 2016 19:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save IngoKl/3ebd968ef3c6da406638 to your computer and use it in GitHub Desktop.
Save IngoKl/3ebd968ef3c6da406638 to your computer and use it in GitHub Desktop.
A Python script that automatically downloads all comments from a theguardian.com article
import urllib2
import re
import time
import sys
# This script will download all comments from a Guardian (theguardian.com) article into a single .html file
# Use 'File ID' to easily identify your file later (this could also be something like a Citavi identifier)
# Ingo Kleiber <ikleiber@googlemail.com>
print """\
_____ _ _ _____ _
/ ____| | (_) | __ \| |
| | __ _ _ __ _ _ __ __| |_ __ _ _ __ | | | | |
| | |_ | | | |/ _` | '__/ _` | |/ _` | '_ \| | | | |
| |__| | |_| | (_| | | | (_| | | (_| | | | | |__| | |____
\_____|\__,_|\__,_|_| \__,_|_|\__,_|_| |_|_____/|______|
"""
print 'Guardian Comment Downloader v.1.1 - Ingo Kleiber (05.01.2015)\n'
guardianLink = raw_input ('> Guadian Link: ')
fileId = raw_input('> File ID: ')
#Figuring out the PageId
pageResponse = urllib2.urlopen(guardianLink)
commentsMatch = re.search(r'/p/(.*?)"', pageResponse.read(), re.M|re.I)
if commentsMatch:
pageId = commentsMatch.group(1)
print '[+] pageId has been retrieved ('+pageId+')'
else:
sys.exit('[-] Could not retrieve pageId!')
#Retrieving comments
downloadCount = 1
downloadError = 0
startTime = time.time()
fileName = 'GuardianDownload_' + pageId + '_' + str(startTime) + '_' + fileId + '.html'
f = open(fileName, 'a')
print '[~] Start downloading pages ...'
while downloadError == 0:
try:
response = urllib2.urlopen('http://www.theguardian.com/discussion/p/' + pageId + '?page=' + str(downloadCount))
html = response.read()
f.write(html)
print '[+] Page: ' + str(downloadCount) + ' has been downloaded!'
downloadCount = downloadCount + 1
except:
downloadError = 1
stopTime = time.time()
runningTime = (stopTime - startTime)/60;
print '[=] ' + str(downloadCount-1) + ' pages have been downloaded in ' + str(round(runningTime,2)) + ' minutes'
f.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment