@maheshkkumar
Created November 24, 2016 05:02
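
# Runs on Python 2; assumed dependencies: beautifulsoup4, lxml (used below as
# BeautifulSoup's parser), pymongo, and a local MongoDB instance (the script
# writes to the 'news_aggregator' database).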
import datetime
import re
import urllib2

from bs4 import BeautifulSoup
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure, DuplicateKeyError

DB_NAME = 'news_aggregator'

# NOTE: MongoClient connects lazily, so ConnectionFailure is more likely to
# surface on the first query than at construction time.
try:
    DATABASE = MongoClient()[DB_NAME]
except ConnectionFailure:
    print "Connection Error"

# BASE_URL = [
# "http://www.datatau.com/x?fnid=bbwdy1hKwE",
# "http://www.datatau.com/x?fnid=3W6vX6uCp9",
# "http://www.datatau.com/x?fnid=yrjPOamlb3",
# "http://www.datatau.com/x?fnid=NC98ffV5Lx",
# "http://www.datatau.com/x?fnid=3Pv8lcbswJ",
# "http://www.datatau.com/x?fnid=LZLi7SprVG",
# "http://www.datatau.com/"
# ]
URL = "http://www.datatau.com"
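
# The fnid tokens in the commented-out BASE_URL list above expire quickly,
# which is presumably why only the front page is scraped. A minimal, untested
# sketch of paginating by following the page's "More" link instead (assumes
# DataTau keeps its Hacker-News-style footer anchor):
def next_page_url(soup):
    more = soup.find('a', text='More')
    return URL + more.get('href') if more is not None else None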

def news_scraper():
    page = urllib2.urlopen(URL).read()
    soup = BeautifulSoup(page, 'lxml')
    # Title cells alternate between the rank ("1.", "2.", ...) and the
    # headline itself; the headline cells sit at odd indices.
    news = soup.find_all('td', {'class': 'title'})
    links = []
    for i in xrange(len(news)):
        if i % 2 == 1 and news[i].a is not None:
            href = news[i].a.get('href').encode('utf-8')
            # Collapse runs of characters outside this whitelist into a single
            # space. (The original pattern's unbalanced quotes broke its
            # character class, so it matched almost nothing.)
            title = re.sub(r"[^A-Za-z0-9 :+*%$#@!&?=/,|_.'\"()\[\]\\-]+",
                           ' ', news[i].a.get_text().encode('utf-8'))
            links.append([href, title])
    count = 1
    # links.reverse()
    for link in links:
        try:
            print "\nDescription --> {}".format(link[1])
            print "Link ---------> {}".format(link[0])
            # Compare the raw string rather than calling int() so that blank
            # or non-numeric input skips the link instead of crashing.
            choice = raw_input("Save this link? (1 = yes): ")
            if choice.strip() == '1':
                category = raw_input("Enter the category : ")
                data = {
                    "_id": link[0],  # the URL doubles as the unique key
                    "description": link[1],
                    "category": category,
                    "createdAt": datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    "score": 1,
                    "likes": 1,
                    "user": "admin"
                }
                DATABASE.articles.insert_one(data)
        except DuplicateKeyError:
            print "{}. {} was not inserted into the DB".format(count, link[1])
        count += 1

if __name__ == '__main__':
    news_scraper()