@maheshkkumar
Created November 24, 2016 05:02
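
# Runs on Python 2; assumed dependencies: beautifulsoup4, lxml (used below as
# BeautifulSoup's parser), pymongo, and a local MongoDB instance (the script
# writes to the 'news_aggregator' database).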
import datetime
import re
import urllib2

from bs4 import BeautifulSoup
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure, DuplicateKeyError

DB_NAME = 'news_aggregator'

# NOTE: MongoClient connects lazily, so ConnectionFailure is more likely to
# surface on the first query than at construction time.
try:
    DATABASE = MongoClient()[DB_NAME]
except ConnectionFailure:
    print "Connection Error"

# BASE_URL = [
# "http://www.datatau.com/x?fnid=bbwdy1hKwE",
# "http://www.datatau.com/x?fnid=3W6vX6uCp9",
# "http://www.datatau.com/x?fnid=yrjPOamlb3",
# "http://www.datatau.com/x?fnid=NC98ffV5Lx",
# "http://www.datatau.com/x?fnid=3Pv8lcbswJ",
# "http://www.datatau.com/x?fnid=LZLi7SprVG",
# "http://www.datatau.com/"
# ]
URL = "http://www.datatau.com"
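
# The fnid tokens in the commented-out BASE_URL list above expire quickly,
# which is presumably why only the front page is scraped. A minimal, untested
# sketch of paginating by following the page's "More" link instead (assumes
# DataTau keeps its Hacker-News-style footer anchor):
def next_page_url(soup):
    more = soup.find('a', text='More')
    return URL + more.get('href') if more is not None else None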

def news_scraper():
    page = urllib2.urlopen(URL).read()
    soup = BeautifulSoup(page, 'lxml')
    # Title cells alternate between the rank ("1.", "2.", ...) and the
    # headline itself; the headline cells sit at odd indices.
    news = soup.find_all('td', {'class': 'title'})
    links = []
    for i in xrange(len(news)):
        if i % 2 == 1 and news[i].a is not None:
            href = news[i].a.get('href').encode('utf-8')
            # Collapse runs of characters outside this whitelist into a single
            # space. (The original pattern's unbalanced quotes broke its
            # character class, so it matched almost nothing.)
            title = re.sub(r"[^A-Za-z0-9 :+*%$#@!&?=/,|_.'\"()\[\]\\-]+",
                           ' ', news[i].a.get_text().encode('utf-8'))
            links.append([href, title])
    count = 1
    # links.reverse()
    for link in links:
        try:
            print "\nDescription --> {}".format(link[1])
            print "Link ---------> {}".format(link[0])
            # Compare the raw string rather than calling int() so that blank
            # or non-numeric input skips the link instead of crashing.
            choice = raw_input("Save this link? (1 = yes): ")
            if choice.strip() == '1':
                category = raw_input("Enter the category : ")
                data = {
                    "_id": link[0],  # the URL doubles as the unique key
                    "description": link[1],
                    "category": category,
                    "createdAt": datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    "score": 1,
                    "likes": 1,
                    "user": "admin"
                }
                DATABASE.articles.insert_one(data)
        except DuplicateKeyError:
            print "{}. {} was not inserted into the DB".format(count, link[1])
        count += 1

if __name__ == '__main__':
    news_scraper()