#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# This is just an exercise for a particular page; the data has not been used
# or stored anywhere.
from urllib import FancyURLopener

from bs4 import BeautifulSoup
from unidecode import unidecode

__author__ = 'Lorenzo'

class MyOpener(FancyURLopener):
    # Spoof a browser User-Agent: many sites refuse requests that
    # advertise the default 'Python-urllib' identifier.
    version = ('Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) '
               'Gecko/20071127 Firefox/2.0.0.11')
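
# A minimal alternative sketch, in case subclassing FancyURLopener feels
# heavyweight: plain urllib2 can send the same spoofed header (the function
# name `fetch` is mine, not part of the original gist; in Python 3 the
# equivalent API lives in urllib.request).
import urllib2

def fetch(url):
    # Build a request carrying the browser User-Agent defined above.
    req = urllib2.Request(url, headers={'User-Agent': MyOpener.version})
    return urllib2.urlopen(req).read()
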
results = []


def crawl_video_urls(url='http://*********************'):
    myopener = MyOpener()
    page = myopener.open(url)
    page = page.read()
    html = BeautifulSoup(page, "lxml")
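    # Note: the "lxml" parser requires the third-party lxml package;
    # BeautifulSoup's bundled "html.parser" is a drop-in alternative
    # if lxml is not installed.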
    # collect every <div class="post"> on the page
    posts = html.find_all('div', class_="post")
    for p in posts:
        obj = {}
        # post title: <h2><a href="...">title</a></h2>
        title = p.find('h2').find('a')
        obj['url'] = title['href']
        obj['title'] = unidecode(title.string)
        # abstract: first <p> inside <div class="browse-description">,
        # with newlines and stray carriage returns stripped out
        abstract = p.find('div', class_='browse-description').find('p')
        obj['abstract'] = unidecode(abstract.string).replace('\n', '').replace('\r\r', ' ').strip()
        results.append(obj)

    # pagination: follow the <a class="next page-numbers"> link, if any
    next_page = html.find('a', class_="next page-numbers")
    if not next_page:
        return None
    print results
    print next_page['href']
    return crawl_video_urls(url=next_page['href'])
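

# A minimal iterative sketch of the same crawl (the name
# `crawl_video_urls_iter` is mine, not part of the original gist):
# walking the "next" links in a while-loop instead of recursing avoids
# Python's recursion limit on sites with very long archives.
def crawl_video_urls_iter(url='http://*********************'):
    myopener = MyOpener()
    while url:
        html = BeautifulSoup(myopener.open(url).read(), "lxml")
        for p in html.find_all('div', class_="post"):
            title = p.find('h2').find('a')
            abstract = p.find('div', class_='browse-description').find('p')
            results.append({
                'url': title['href'],
                'title': unidecode(title.string),
                'abstract': unidecode(abstract.string).replace('\n', '').replace('\r\r', ' ').strip()
            })
        next_page = html.find('a', class_="next page-numbers")
        url = next_page['href'] if next_page else None
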
crawl_video_urls()
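
# A minimal usage sketch: serialise the accumulated `results` to JSON text
# (printed rather than written to disk, since the header notes the data is
# not stored anywhere).
import json
print json.dumps(results, indent=2)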