@AnthonyBloomer
Last active September 30, 2018 18:33
from slideshare import Slideshare
from pymongo import MongoClient
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('topic')
    args = parser.parse_args()

    client = MongoClient('mongodb://localhost:27017/')
    db = client.slideshare

    has_pages = True
    page = 1

    while has_pages:
        try:
            slideshare = Slideshare()
            slideshows = slideshare.scrape(topic=args.topic, page_num=page)
            if not slideshows:
                # An empty result page means there is nothing left to scrape.
                has_pages = False
                print('Finished!')
                break
            for slideshow in slideshows:
                ss = {
                    'title': slideshow.title(),
                    'description': slideshow.description(),
                    'publish_date': slideshow.publish_date(),
                    'views': slideshow.views(),
                    'favourites': slideshow.favourites(),
                    'author': slideshow.author(),
                    'comments': slideshow.comments(),
                    'categories': slideshow.categories()
                }
                db.slideshare.insert_one(ss)
        except Exception:
            # Stop paginating if a request or parse fails.
            has_pages = False
            print('Finished!')
        page += 1
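A quick sanity check after running the script above, assuming the local MongoDB instance and the slideshare database/collection names used by the scraper (count_documents requires pymongo 3.7 or newer); this is only a sketch, not part of the original gist:

from pymongo import MongoClient

# Connect to the same local MongoDB instance the scraper writes to.
client = MongoClient('mongodb://localhost:27017/')
db = client.slideshare

# Print how many slideshows were stored and a few of their titles.
print(db.slideshare.count_documents({}))
for doc in db.slideshare.find().limit(5):
    print(doc['title'])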
import requests
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
from slideshow import Slideshow


class Slideshare(object):

    def scrape(self, topic, page_num):
        # Build the search URL for the given topic and page number.
        base = "https://www.slideshare.net"
        query = quote_plus(topic)
        url = base + "/search/slideshow?lang=en&page=%s&q=%s" % (page_num, query)
        print('Processing: ' + url)
        soup = self.call(url)
        titles = soup.find_all('a', {'class': 'title-link'}, href=True)
        slideshares = []
        for title in titles:
            # Fetch each result page and wrap it in a Slideshow object.
            link = base + title['href']
            soup = self.call(link)
            ss = Slideshow(soup)
            slideshares.append(ss)
        return slideshares

    def call(self, url):
        req = requests.get(url)
        soup = BeautifulSoup(req.content, 'html.parser')
        return soup
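A minimal standalone sketch of using the scraper without MongoDB; the topic string and page number are only examples:

if __name__ == '__main__':
    slideshare = Slideshare()
    # Fetch the first page of results for an example topic.
    for slideshow in slideshare.scrape(topic='python', page_num=1):
        print(slideshow.title(), '-', slideshow.author())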
class Slideshow(object):
    """Wraps the parsed HTML of a single SlideShare page and exposes its metadata."""

    def __init__(self, soup):
        self.soup = soup

    def title(self):
        try:
            return self.soup.find('span', {'class': 'j-title-breadcrumb'}).text.strip()
        except AttributeError:
            return None

    def description(self):
        try:
            return " ".join(self.soup.find('p', {'id': 'slideshow-description-paragraph'}).text.split())
        except AttributeError:
            return None

    def author(self):
        try:
            return self.soup.find('a', {'class': 'j-author-name'}).text.strip()
        except AttributeError:
            return None

    def favourites(self):
        try:
            return self.soup.find('span', {'class': 'j-favs-count'}).text.strip().split()[0].replace(',', '')
        except AttributeError:
            return None

    def views(self):
        try:
            return self.soup.find('div', {'class': 'stat-value'}).text.strip().replace(',', '')
        except AttributeError:
            return None

    def publish_date(self):
        try:
            return self.soup.find('time').text.strip()
        except AttributeError:
            return None

    def categories(self):
        try:
            categories_container = self.soup.find('div', {'class': 'categories-container'})
            categories = categories_container.find_all('a')
            return [category.text.strip() for category in categories]
        except AttributeError:
            return None

    def comments(self):
        try:
            comments = self.soup.find_all('div', {'class': 'commentText'})
            # Skip the first matched element and return the remaining comments as plain text.
            return [" ".join(comment.text.split()) for comment in comments[1:]]
        except AttributeError:
            return None
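For reference, a small sketch of using Slideshow on its own against a single page; the URL below is a hypothetical placeholder, not a real slideshow:

import requests
from bs4 import BeautifulSoup

# Parse one slideshow page directly and print a few of its fields.
page = requests.get('https://www.slideshare.net/some-user/some-slideshow')
slideshow = Slideshow(BeautifulSoup(page.content, 'html.parser'))
print(slideshow.title())
print(slideshow.views())
print(slideshow.categories())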