#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# This is just an exercise for a particular page; the data has not been used
# or stored anywhere.
from urllib import FancyURLopener

from bs4 import BeautifulSoup
from unidecode import unidecode

__author__ = 'Lorenzo'

class MyOpener(FancyURLopener):
    # Spoof a browser User-Agent: many sites refuse requests that
    # advertise the default 'Python-urllib' identifier.
    version = ('Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) '
               'Gecko/20071127 Firefox/2.0.0.11')
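
# A minimal alternative sketch, in case subclassing FancyURLopener feels
# heavyweight: plain urllib2 can send the same spoofed header (the function
# name `fetch` is mine, not part of the original gist; in Python 3 the
# equivalent API lives in urllib.request).
import urllib2

def fetch(url):
    # Build a request carrying the browser User-Agent defined above.
    req = urllib2.Request(url, headers={'User-Agent': MyOpener.version})
    return urllib2.urlopen(req).read()
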
results = []


def crawl_video_urls(url='http://*********************'):
    myopener = MyOpener()
    page = myopener.open(url)
    page = page.read()
    html = BeautifulSoup(page, "lxml")
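    # Note: the "lxml" parser requires the third-party lxml package;
    # BeautifulSoup's bundled "html.parser" is a drop-in alternative
    # if lxml is not installed.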
    # collect every <div class="post"> on the page
    posts = html.find_all('div', class_="post")
    for p in posts:
        obj = {}
        # post title: <h2><a href="...">title</a></h2>
        title = p.find('h2').find('a')
        obj['url'] = title['href']
        obj['title'] = unidecode(title.string)
        # abstract: first <p> inside <div class="browse-description">,
        # with newlines and stray carriage returns stripped out
        abstract = p.find('div', class_='browse-description').find('p')
        obj['abstract'] = unidecode(abstract.string).replace('\n', '').replace('\r\r', ' ').strip()
        results.append(obj)

    # pagination: follow the <a class="next page-numbers"> link, if any
    next_page = html.find('a', class_="next page-numbers")
    if not next_page:
        return None
    print results
    print next_page['href']
    return crawl_video_urls(url=next_page['href'])
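

# A minimal iterative sketch of the same crawl (the name
# `crawl_video_urls_iter` is mine, not part of the original gist):
# walking the "next" links in a while-loop instead of recursing avoids
# Python's recursion limit on sites with very long archives.
def crawl_video_urls_iter(url='http://*********************'):
    myopener = MyOpener()
    while url:
        html = BeautifulSoup(myopener.open(url).read(), "lxml")
        for p in html.find_all('div', class_="post"):
            title = p.find('h2').find('a')
            abstract = p.find('div', class_='browse-description').find('p')
            results.append({
                'url': title['href'],
                'title': unidecode(title.string),
                'abstract': unidecode(abstract.string).replace('\n', '').replace('\r\r', ' ').strip()
            })
        next_page = html.find('a', class_="next page-numbers")
        url = next_page['href'] if next_page else None
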
crawl_video_urls()
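
# A minimal usage sketch: serialise the accumulated `results` to JSON text
# (printed rather than written to disk, since the header notes the data is
# not stored anywhere).
import json
print json.dumps(results, indent=2)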