Skip to content

Instantly share code, notes, and snippets.

Created August 15, 2014 19:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/bf67c8704e46fa4c454b to your computer and use it in GitHub Desktop.
Save anonymous/bf67c8704e46fa4c454b to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
# NOTE: PEP 263 requires the coding declaration on line 1 or 2, BEFORE any
# statement; the original placed it after `__author__`, making it ineffective.
"""Crawler for milliyet.com.tr: stores front-page links, then scrapes each."""
import urllib2

from bs4 import BeautifulSoup
import peewee
from peewee import *

import news_db
import newspaper_db

__author__ = 'Fatih'
def link_saver(name, link, image):
    """Persist one scraped front-page link as a Newspaper row.

    :param name: newspaper identifier (e.g. ``'milliyet'``)
    :param link: article URL taken from an ``<a href>`` attribute
    :param image: thumbnail URL taken from an ``<img src>`` attribute
    """
    # BUG fix: the original built the connection with the Newspaper *model*
    # (``Newspaper('crawler', user=..., ...)``); the handle must be a peewee
    # MySQLDatabase, as link_opener() already does correctly.
    # NOTE(review): assumes the Newspaper model is bound to this database in
    # newspaper_db -- confirm against that module.
    db = MySQLDatabase('crawler', user='sqlalchemy', passwd='mypassword')
    db.connect()
    try:
        newspaper = Newspaper(name=name, link=link, image=image)
        newspaper.save()
    finally:
        # Fix connection leak: the original never closed the handle.
        db.close()
def news_saver(header, articleBody, articleDate):
    """Persist one scraped article as a News row.

    :param header: article headline text
    :param articleBody: article body text
    :param articleDate: article publication date text
    """
    # BUG fix: the original built the connection with the News *model* and
    # never called connect(); use MySQLDatabase like link_opener() does.
    db = MySQLDatabase('crawler', user='sqlalchemy', passwd='mypassword')
    db.connect()
    try:
        # BUG fix: peewee Model constructors take keyword arguments only;
        # the original passed positionals, which peewee rejects.
        # NOTE(review): assumes the News model fields are named header,
        # articleBody, articleDate -- confirm against news_db.
        news = News(header=header, articleBody=articleBody,
                    articleDate=articleDate)
        news.save()
    finally:
        # Fix connection leak: the original never closed the handle.
        db.close()
def link_opener():
    """Fetch every stored Newspaper link and scrape article data from it.

    For each saved row, downloads the page and extracts date, headline and
    body from the ``detayTop`` container, handing them to :func:`news_saver`.
    """
    db = MySQLDatabase('crawler', user='sqlalchemy', passwd='mypassword')
    db.connect()
    try:
        header = {'User-Agent': 'Mozilla/5.0'}
        for newspaper in Newspaper.select():
            # BUG fix: a peewee row is not subscriptable; read the model
            # attribute that link_saver() stored, which is ``link``.
            url = newspaper.link
            request = urllib2.Request(url, headers=header)
            page = urllib2.urlopen(request)
            soup = BeautifulSoup(page)
            for divs in soup.findAll("div", {"class": "detayTop"}):
                for articleDate in divs.findAll("div", {"class": "date"}):
                    # BUG fix: "itremprop" typo -> "itemprop"; the original
                    # filter matched nothing, so no news was ever saved.
                    for name in divs.findAll("h2", {"itemprop": "name"}):
                        for articleBody in divs.findAll(
                                "h3", {"itemprop": "articleBody"}):
                            # Store the text content, not the raw Tag objects
                            # (Tags would serialize with their HTML markup).
                            news_saver(name.get_text(),
                                       articleBody.get_text(),
                                       articleDate.get_text())
    finally:
        # Fix connection leak: the original never closed the handle.
        db.close()
def milliyet_crawler():
name = 'milliyet'
url = 'http://www.milliyet.com.tr'
header = {'User-Agent': 'Mozilla/5.0'}
request = urllib2.Request(url, headers=header)
page = urllib2.urlopen(request)
soup = BeautifulSoup(page)
divs = soup.find("div", {"class": "row"})
for div in divs.findAll("div"):
links = div.findAll('a')
imgs = div.findAll('img')
for link in links:
for img in imgs:
link_saver(name, link['href'], img['src'])
if __name__ == '__main__':
    #initializer()
    # BUG fix: the original ran link_opener() BEFORE milliyet_crawler(), so
    # on a fresh database the opener iterated an empty Newspaper table.
    # Crawl the front page first, then scrape each stored link.
    milliyet_crawler()
    link_opener()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment