@AnthonyBloomer
Last active September 30, 2018 18:33
from slideshare import Slideshare
from pymongo import MongoClient
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('topic')
    args = parser.parse_args()

    client = MongoClient('mongodb://localhost:27017/')
    db = client.slideshare

    has_pages = True
    page = 1

    while has_pages:
        try:
            slideshare = Slideshare()
            slideshows = slideshare.scrape(topic=args.topic, page_num=page)
            if not slideshows:
                # An empty result page means there is nothing left to scrape.
                has_pages = False
                print('Finished!')
                break
            for slideshow in slideshows:
                ss = {
                    'title': slideshow.title(),
                    'description': slideshow.description(),
                    'publish_date': slideshow.publish_date(),
                    'views': slideshow.views(),
                    'favourites': slideshow.favourites(),
                    'author': slideshow.author(),
                    'comments': slideshow.comments(),
                    'categories': slideshow.categories()
                }
                db.slideshare.insert_one(ss)
        except Exception:
            # Stop paginating if a request or parse fails.
            has_pages = False
            print('Finished!')
        page += 1
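A quick sanity check after running the script above, assuming the local MongoDB instance and the slideshare database/collection names used by the scraper (count_documents requires pymongo 3.7 or newer); this is only a sketch, not part of the original gist:

from pymongo import MongoClient

# Connect to the same local MongoDB instance the scraper writes to.
client = MongoClient('mongodb://localhost:27017/')
db = client.slideshare

# Print how many slideshows were stored and a few of their titles.
print(db.slideshare.count_documents({}))
for doc in db.slideshare.find().limit(5):
    print(doc['title'])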
import requests
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
from slideshow import Slideshow


class Slideshare(object):

    def scrape(self, topic, page_num):
        # Build the search URL for the given topic and page number.
        base = "https://www.slideshare.net"
        query = quote_plus(topic)
        url = base + "/search/slideshow?lang=en&page=%s&q=%s" % (page_num, query)
        print('Processing: ' + url)
        soup = self.call(url)
        titles = soup.find_all('a', {'class': 'title-link'}, href=True)
        slideshares = []
        for title in titles:
            # Fetch each result page and wrap it in a Slideshow object.
            link = base + title['href']
            soup = self.call(link)
            ss = Slideshow(soup)
            slideshares.append(ss)
        return slideshares

    def call(self, url):
        req = requests.get(url)
        soup = BeautifulSoup(req.content, 'html.parser')
        return soup
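A minimal standalone sketch of using the scraper without MongoDB; the topic string and page number are only examples:

if __name__ == '__main__':
    slideshare = Slideshare()
    # Fetch the first page of results for an example topic.
    for slideshow in slideshare.scrape(topic='python', page_num=1):
        print(slideshow.title(), '-', slideshow.author())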
class Slideshow(object):
    """Wraps the parsed HTML of a single SlideShare page and exposes its metadata."""

    def __init__(self, soup):
        self.soup = soup

    def title(self):
        try:
            return self.soup.find('span', {'class': 'j-title-breadcrumb'}).text.strip()
        except AttributeError:
            return None

    def description(self):
        try:
            return " ".join(self.soup.find('p', {'id': 'slideshow-description-paragraph'}).text.split())
        except AttributeError:
            return None

    def author(self):
        try:
            return self.soup.find('a', {'class': 'j-author-name'}).text.strip()
        except AttributeError:
            return None

    def favourites(self):
        try:
            return self.soup.find('span', {'class': 'j-favs-count'}).text.strip().split()[0].replace(',', '')
        except AttributeError:
            return None

    def views(self):
        try:
            return self.soup.find('div', {'class': 'stat-value'}).text.strip().replace(',', '')
        except AttributeError:
            return None

    def publish_date(self):
        try:
            return self.soup.find('time').text.strip()
        except AttributeError:
            return None

    def categories(self):
        try:
            categories_container = self.soup.find('div', {'class': 'categories-container'})
            categories = categories_container.find_all('a')
            return [category.text.strip() for category in categories]
        except AttributeError:
            return None

    def comments(self):
        try:
            comments = self.soup.find_all('div', {'class': 'commentText'})
            # Skip the first matched element and return the remaining comments as plain text.
            return [" ".join(comment.text.split()) for comment in comments[1:]]
        except AttributeError:
            return None
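For reference, a small sketch of using Slideshow on its own against a single page; the URL below is a hypothetical placeholder, not a real slideshow:

import requests
from bs4 import BeautifulSoup

# Parse one slideshow page directly and print a few of its fields.
page = requests.get('https://www.slideshare.net/some-user/some-slideshow')
slideshow = Slideshow(BeautifulSoup(page.content, 'html.parser'))
print(slideshow.title())
print(slideshow.views())
print(slideshow.categories())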