Skip to content

Instantly share code, notes, and snippets.

@awforsythe
Last active December 21, 2015 08:19
Show Gist options
  • Save awforsythe/6277652 to your computer and use it in GitHub Desktop.
Save awforsythe/6277652 to your computer and use it in GitHub Desktop.
Scrapes commenter usernames from a YouTube video using BeautifulSoup and the all_comments URL.
'''
scrape_commenters.py
by Alex Forsythe
19 August 2013
Uses the youtube all_comments page together with BeautifulSoup (not included in
the standard library) to generate a list of the users who have commented on a
specific video. Since YouTube aggressively throttles responses to all_comments,
it may take multiple attempts (separated by a lengthy timeout) to pull down all
comments.
1. Place this script in the desired directory.
2. Ensure that you have BeautifulSoup (bs4) installed for Python 2.
3. Set the value of kVideoId by replacing VIDEO_ID_HERE with your video id.
4. Open a terminal and browse to the directory containing this script.
5. Run 'python scrape_commenters.py' and wait.
6. Names will be written to commenters.txt in the same directory.
'''
import sys
import math
import time
import random
import urllib2
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
# ID of the YouTube video whose commenters will be scraped. Replace the
# placeholder with a real video id before running (step 3 in the docstring).
kVideoId = r'VIDEO_ID_HERE'
# When True, progress messages are written to the console as pages are fetched.
kPrintProgressToConsole = True
# Seconds to sleep before re-requesting a page that came back (nearly) empty,
# which happens when YouTube throttles the all_comments endpoint.
kRetryIntervalSeconds = 30.0
def log_partial(verbose, s):
    '''
    Writes s to stdout without a trailing newline, but only when running in
    verbose mode; otherwise does nothing.
    '''
    if not verbose:
        return
    sys.stdout.write(s)
def log(verbose, s):
    '''
    If called in verbose mode, writes a line to the screen with a terminating
    newline; otherwise does nothing.
    '''
    # Use sys.stdout.write (matching log_partial) instead of the Python 2
    # 'print' statement: the output is byte-identical, the two logging helpers
    # stay consistent, and the source parses under both Python 2 and 3.
    if verbose:
        sys.stdout.write(s + '\n')
class Video(object):
    '''
    Represents a YouTube video with a particular ID. The metadata needed for
    paging through comments is fetched from the v2 API (no developer key
    required).
    '''
    def __init__(self, videoId):
        '''
        Creates a video for the given id, immediately fetching its metadata
        from the API to determine comment and page counts.
        '''
        self._id = videoId
        # Parse the API response to find the advertised comment count, then
        # derive how many all_comments page requests that count implies.
        root = self._getXml()
        ns = '{http://schemas.google.com/g/2005}'
        feedLink = root.find(ns + 'comments').find(ns + 'feedLink')
        self._numComments = int(feedLink.attrib['countHint'])
        pages = float(self._numComments) / CommentsPage.kMaxCommentsPerPage
        self._totalPages = int(math.ceil(pages))
    def getNumComments(self):
        '''
        Returns the total number of comments made on this video.
        '''
        return self._numComments
    def getTotalPages(self):
        '''
        Returns how many page requests are needed to read every comment
        associated with this video.
        '''
        return self._totalPages
    def getExpectedCommentCount(self, pageNumber):
        '''
        Returns the number of comments we expect the given page (1-based) to
        contain, given the video's total comment count. Out-of-range pages
        yield 0.
        '''
        if not (1 <= pageNumber <= self._totalPages):
            return 0
        perPage = CommentsPage.kMaxCommentsPerPage
        if pageNumber * perPage > self._numComments:
            # The final, partially-filled page holds only the remainder.
            return self._numComments % perPage
        return perPage
    def _getXml(self):
        '''
        Requests this video's metadata over HTTP and returns the response
        parsed into an ElementTree hierarchy.
        '''
        response = urllib2.urlopen('https://gdata.youtube.com/feeds/api/videos/%s' % self._id)
        return ET.fromstring(response.read())
class Commenter(object):
    '''
    Represents a user who has made a comment on a YouTube video.
    '''
    def __init__(self, authorElement):
        '''
        Builds a Commenter from the given HTML element (the a tag under
        span.author for a particular comment).
        '''
        href = authorElement['href']
        self._displayName = authorElement.text
        self._channelName = href.split('/')[-1]
        self._channelIsLegible = href.startswith('/user')
    def __eq__(self, other):
        '''
        Overridden so that equality tests treat Commenter as a plain old data
        structure.
        '''
        if type(self) is not type(other):
            return False
        return self.__dict__ == other.__dict__
    def __ne__(self, other):
        '''
        Defined as the negation of __eq__ so inequality tests also treat
        Commenter as a plain old data structure.
        '''
        return not (self == other)
    def __hash__(self):
        '''
        Overridden so that set membership tests treat Commenter as a plain old
        data structure.
        '''
        key = (self._displayName, self._channelName, self._channelIsLegible)
        return hash(key)
    def __str__(self):
        '''
        Overridden to use the most sensible representation of the commenter's
        display name and/or channel/user name.
        '''
        display = self._displayName.encode('utf-8')
        channel = self._channelName.encode('utf-8')
        if self._displayName == self._channelName:
            return display
        if self._channelIsLegible:
            return '%s (%s)' % (display, channel)
        # Illegible channel ids are abbreviated to their last four characters.
        return '%s <%s>' % (display, channel[-4:])
class HtmlPage(object):
    '''
    Represents an HTML page received as a response to a URL request and parsed
    with BeautifulSoup.
    '''
    def __init__(self, url):
        '''
        Fetches the given URL, blocking until the response arrives, and parses
        the body into a BeautifulSoup tree stored on the instance.
        '''
        response = urllib2.urlopen(url)
        self._soup = BeautifulSoup(response)
class CommentsPage(HtmlPage):
    '''
    Represents the HTML page returned by accessing youtube.com/all_comments
    for a particular video and comment page number.
    '''
    # Maximum number of comments YouTube serves on one all_comments page.
    kMaxCommentsPerPage = 500
    def __init__(self, video, pageNumber):
        '''
        Fetches the comments page at the given 1-based page number for the
        given video and parses out its commenters.
        '''
        # Build the all_comments URL and let the base class fetch/parse it.
        url = 'http://www.youtube.com/all_comments?v=%s&page=%d' % (video._id, pageNumber)
        super(CommentsPage, self).__init__(url)
        elements = self._listCommentElements()
        self._numComments = len(elements)
        # Convert each comment element to a Commenter, dropping the None
        # results produced by invalid (e.g. deleted) comments.
        parsed = (self._getCommenter(e) for e in elements)
        self._commenters = {c for c in parsed if c}
    def getNumComments(self):
        '''
        Returns the total number of comment elements contained in the HTML
        response for this comment page.
        '''
        return self._numComments
    def getNumCommenters(self):
        '''
        Returns the total number of unique commenters parsed from this page's
        comments.
        '''
        return len(self._commenters)
    def getCommenters(self):
        '''
        Returns the set of Commenter objects representing the users whose
        comments are found on this page.
        '''
        return self._commenters
    def _listCommentElements(self):
        '''
        Returns all comment elements contained on this page.
        '''
        return self._soup.select('ul#all-comments > li.comment')
    def _getCommenter(self, commentElement):
        '''
        Creates and returns a new Commenter for the given comment element
        (li.comment under #all-comments). Returns None when the element lacks
        an author link (e.g. a deleted comment).
        '''
        try:
            return Commenter(commentElement.select('span.author > a')[0])
        except IndexError:
            return None
def scrape_comments(videoId, verbose):
    '''
    Returns a set of Commenter objects representing all the users who made
    comments on the video with the given ID. If verbose is True, prints
    progress messages to the console.
    '''
    video = Video(videoId)
    log(verbose, 'Scraping comments for video with ID < %s >...' % videoId)
    log(verbose, 'Total comments: %d (Page requests: %d)' % (video.getNumComments(), video.getTotalPages()))
    collected = set()
    pageNumber = 1
    # Request pages until the next one is expected to hold no comments. A
    # throttled response (one comment where many were expected) is retried
    # after a delay instead of advancing to the next page.
    while video.getExpectedCommentCount(pageNumber) > 0:
        expected = video.getExpectedCommentCount(pageNumber)
        log_partial(verbose, 'Getting comments from page %d... ' % pageNumber)
        page = CommentsPage(video, pageNumber)
        received = page.getNumComments()
        if received <= 1 and expected > 1:
            log(verbose, 'only got %d. Retrying in %d seconds...' % (received, kRetryIntervalSeconds))
            time.sleep(kRetryIntervalSeconds)
            continue
        log(verbose, 'got %d! (unique: %d)' % (received, page.getNumCommenters()))
        collected |= page.getCommenters()
        pageNumber += 1
    log(verbose, 'Finished scraping comments.')
    return collected
def write_commenters(commenters, filename, verbose):
    '''
    Writes the given set of commenters to a plain text file at the given
    filename, one per line. If verbose is True, prints status messages to the
    console.
    '''
    log_partial(verbose, 'Writing usernames to %s... ' % filename)
    with open(filename, 'w') as fp:
        fp.writelines('%s\n' % commenter for commenter in commenters)
    log(verbose, 'finished.')
if __name__ == '__main__':
    # The placeholder test is split into startswith/endswith halves so that a
    # blanket find-and-replace of VIDEO_ID_HERE cannot accidentally alter it.
    stillPlaceholder = (kVideoId.startswith('VIDEO_ID_HER')
                        and kVideoId.endswith('IDEO_ID_HERE'))
    if stillPlaceholder:
        print('Set the value of kVideoId on line 28!')
    else:
        commenters = scrape_comments(kVideoId, kPrintProgressToConsole)
        write_commenters(commenters, 'commenters.txt', kPrintProgressToConsole)
        winner = random.choice(list(commenters))
        log(kPrintProgressToConsole, 'Randomly chosen commenter: %s' % winner)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment