Skip to content

Instantly share code, notes, and snippets.

@awforsythe
Last active December 21, 2015 08:19
Show Gist options
  • Save awforsythe/6277652 to your computer and use it in GitHub Desktop.
Save awforsythe/6277652 to your computer and use it in GitHub Desktop.
Scrapes commenter usernames from a YouTube video using BeautifulSoup and the all_comments URL.
'''
scrape_commenters.py
by Alex Forsythe
19 August 2013
Uses the youtube all_comments page together with BeautifulSoup (not included in
the standard library) to generate a list of the users who have commented on a
specific video. Since YouTube aggressively throttles responses to all_comments,
it may take multiple attempts (separated by a lengthy timeout) to pull down all
comments.
1. Place this script in the desired directory.
2. Ensure that you have BeautifulSoup (bs4) installed for Python 2.
3. Set the value of kVideoId by replacing VIDEO_ID_HERE with your video id.
4. Open a terminal and browse to the directory containing this script.
5. Run 'python scrape_commenters.py' and wait.
6. Names will be written to commenters.txt in the same directory.
'''
import sys
import math
import time
import random
import urllib2
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
# ID of the YouTube video whose commenters will be scraped. Replace the
# placeholder with a real video id before running (step 3 in the docstring).
kVideoId = r'VIDEO_ID_HERE'
# When True, progress messages are written to the console as pages are fetched.
kPrintProgressToConsole = True
# Seconds to sleep before re-requesting a page that came back (nearly) empty,
# which happens when YouTube throttles the all_comments endpoint.
kRetryIntervalSeconds = 30.0
def log_partial(verbose, s):
    '''
    Writes s to stdout without a trailing newline, but only when running in
    verbose mode; otherwise does nothing.
    '''
    if not verbose:
        return
    sys.stdout.write(s)
def log(verbose, s):
    '''
    If called in verbose mode, writes a line to the screen with a terminating
    newline; otherwise does nothing.
    '''
    # Use sys.stdout.write (matching log_partial) instead of the Python 2
    # 'print' statement: the output is byte-identical, the two logging helpers
    # stay consistent, and the source parses under both Python 2 and 3.
    if verbose:
        sys.stdout.write(s + '\n')
class Video(object):
    '''
    Represents a YouTube video with a particular ID. The metadata needed for
    paging through comments is fetched from the v2 API (no developer key
    required).
    '''
    def __init__(self, videoId):
        '''
        Creates a video for the given id, immediately fetching its metadata
        from the API to determine comment and page counts.
        '''
        self._id = videoId
        # Parse the API response to find the advertised comment count, then
        # derive how many all_comments page requests that count implies.
        root = self._getXml()
        ns = '{http://schemas.google.com/g/2005}'
        feedLink = root.find(ns + 'comments').find(ns + 'feedLink')
        self._numComments = int(feedLink.attrib['countHint'])
        pages = float(self._numComments) / CommentsPage.kMaxCommentsPerPage
        self._totalPages = int(math.ceil(pages))
    def getNumComments(self):
        '''
        Returns the total number of comments made on this video.
        '''
        return self._numComments
    def getTotalPages(self):
        '''
        Returns how many page requests are needed to read every comment
        associated with this video.
        '''
        return self._totalPages
    def getExpectedCommentCount(self, pageNumber):
        '''
        Returns the number of comments we expect the given page (1-based) to
        contain, given the video's total comment count. Out-of-range pages
        yield 0.
        '''
        if not (1 <= pageNumber <= self._totalPages):
            return 0
        perPage = CommentsPage.kMaxCommentsPerPage
        if pageNumber * perPage > self._numComments:
            # The final, partially-filled page holds only the remainder.
            return self._numComments % perPage
        return perPage
    def _getXml(self):
        '''
        Requests this video's metadata over HTTP and returns the response
        parsed into an ElementTree hierarchy.
        '''
        response = urllib2.urlopen('https://gdata.youtube.com/feeds/api/videos/%s' % self._id)
        return ET.fromstring(response.read())
class Commenter(object):
    '''
    Represents a user who has made a comment on a YouTube video.
    '''
    def __init__(self, authorElement):
        '''
        Builds a Commenter from the given HTML element (the a tag under
        span.author for a particular comment).
        '''
        href = authorElement['href']
        self._displayName = authorElement.text
        self._channelName = href.split('/')[-1]
        self._channelIsLegible = href.startswith('/user')
    def __eq__(self, other):
        '''
        Overridden so that equality tests treat Commenter as a plain old data
        structure.
        '''
        if type(self) is not type(other):
            return False
        return self.__dict__ == other.__dict__
    def __ne__(self, other):
        '''
        Defined as the negation of __eq__ so inequality tests also treat
        Commenter as a plain old data structure.
        '''
        return not (self == other)
    def __hash__(self):
        '''
        Overridden so that set membership tests treat Commenter as a plain old
        data structure.
        '''
        key = (self._displayName, self._channelName, self._channelIsLegible)
        return hash(key)
    def __str__(self):
        '''
        Overridden to use the most sensible representation of the commenter's
        display name and/or channel/user name.
        '''
        display = self._displayName.encode('utf-8')
        channel = self._channelName.encode('utf-8')
        if self._displayName == self._channelName:
            return display
        if self._channelIsLegible:
            return '%s (%s)' % (display, channel)
        # Illegible channel ids are abbreviated to their last four characters.
        return '%s <%s>' % (display, channel[-4:])
class HtmlPage(object):
    '''
    Represents an HTML page received as a response to a URL request and parsed
    with BeautifulSoup.
    '''
    def __init__(self, url):
        '''
        Fetches the given URL, blocking until the response arrives, and parses
        the body into a BeautifulSoup tree stored on the instance.
        '''
        response = urllib2.urlopen(url)
        self._soup = BeautifulSoup(response)
class CommentsPage(HtmlPage):
    '''
    Represents the HTML page returned by accessing youtube.com/all_comments
    for a particular video and comment page number.
    '''
    # Maximum number of comments YouTube serves on one all_comments page.
    kMaxCommentsPerPage = 500
    def __init__(self, video, pageNumber):
        '''
        Fetches the comments page at the given 1-based page number for the
        given video and parses out its commenters.
        '''
        # Build the all_comments URL and let the base class fetch/parse it.
        url = 'http://www.youtube.com/all_comments?v=%s&page=%d' % (video._id, pageNumber)
        super(CommentsPage, self).__init__(url)
        elements = self._listCommentElements()
        self._numComments = len(elements)
        # Convert each comment element to a Commenter, dropping the None
        # results produced by invalid (e.g. deleted) comments.
        parsed = (self._getCommenter(e) for e in elements)
        self._commenters = {c for c in parsed if c}
    def getNumComments(self):
        '''
        Returns the total number of comment elements contained in the HTML
        response for this comment page.
        '''
        return self._numComments
    def getNumCommenters(self):
        '''
        Returns the total number of unique commenters parsed from this page's
        comments.
        '''
        return len(self._commenters)
    def getCommenters(self):
        '''
        Returns the set of Commenter objects representing the users whose
        comments are found on this page.
        '''
        return self._commenters
    def _listCommentElements(self):
        '''
        Returns all comment elements contained on this page.
        '''
        return self._soup.select('ul#all-comments > li.comment')
    def _getCommenter(self, commentElement):
        '''
        Creates and returns a new Commenter for the given comment element
        (li.comment under #all-comments). Returns None when the element lacks
        an author link (e.g. a deleted comment).
        '''
        try:
            return Commenter(commentElement.select('span.author > a')[0])
        except IndexError:
            return None
def scrape_comments(videoId, verbose):
    '''
    Returns a set of Commenter objects representing all the users who made
    comments on the video with the given ID. If verbose is True, prints
    progress messages to the console.
    '''
    video = Video(videoId)
    log(verbose, 'Scraping comments for video with ID < %s >...' % videoId)
    log(verbose, 'Total comments: %d (Page requests: %d)' % (video.getNumComments(), video.getTotalPages()))
    collected = set()
    pageNumber = 1
    # Request pages until the next one is expected to hold no comments. A
    # throttled response (one comment where many were expected) is retried
    # after a delay instead of advancing to the next page.
    while video.getExpectedCommentCount(pageNumber) > 0:
        expected = video.getExpectedCommentCount(pageNumber)
        log_partial(verbose, 'Getting comments from page %d... ' % pageNumber)
        page = CommentsPage(video, pageNumber)
        received = page.getNumComments()
        if received <= 1 and expected > 1:
            log(verbose, 'only got %d. Retrying in %d seconds...' % (received, kRetryIntervalSeconds))
            time.sleep(kRetryIntervalSeconds)
            continue
        log(verbose, 'got %d! (unique: %d)' % (received, page.getNumCommenters()))
        collected |= page.getCommenters()
        pageNumber += 1
    log(verbose, 'Finished scraping comments.')
    return collected
def write_commenters(commenters, filename, verbose):
    '''
    Writes the given set of commenters to a plain text file at the given
    filename, one per line. If verbose is True, prints status messages to the
    console.
    '''
    log_partial(verbose, 'Writing usernames to %s... ' % filename)
    with open(filename, 'w') as fp:
        fp.writelines('%s\n' % commenter for commenter in commenters)
    log(verbose, 'finished.')
if __name__ == '__main__':
    # The placeholder test is split into startswith/endswith halves so that a
    # blanket find-and-replace of VIDEO_ID_HERE cannot accidentally alter it.
    stillPlaceholder = (kVideoId.startswith('VIDEO_ID_HER')
                        and kVideoId.endswith('IDEO_ID_HERE'))
    if stillPlaceholder:
        print('Set the value of kVideoId on line 28!')
    else:
        commenters = scrape_comments(kVideoId, kPrintProgressToConsole)
        write_commenters(commenters, 'commenters.txt', kPrintProgressToConsole)
        winner = random.choice(list(commenters))
        log(kPrintProgressToConsole, 'Randomly chosen commenter: %s' % winner)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment