Skip to content

Instantly share code, notes, and snippets.

@iannsp
Created February 13, 2014 02:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save iannsp/10e63b6cbc97ae6ecf1f to your computer and use it in GitHub Desktop.
Save iannsp/10e63b6cbc97ae6ecf1f to your computer and use it in GitHub Desktop.
import time
import re
import urllib2
import datetime
import sys
import os, errno
from BeautifulSoup import BeautifulSoup
from pymongo import MongoClient
import nltk
# Python 2 only: ``reload(sys)`` is the usual idiom for making
# ``sys.setdefaultencoding`` callable again after interpreter start-up
# (NOTE(review): relies on CPython 2 ``site`` behaviour -- confirm before porting).
# The call then makes UTF-8 the process-wide default codec for implicit
# str/unicode conversions performed by the rest of this script.
reload(sys)
sys.setdefaultencoding('utf-8')
# Names of parent nodes whose text children are not treated as page text.
_INVISIBLE_PARENTS = frozenset(['style', 'script', '[document]', 'head', 'title'])

# Serialized HTML comment.  DOTALL lets ``.`` cross line breaks so that
# comments spanning several lines are recognised too.
_COMMENT_RE = re.compile(r'<!--.*-->', re.DOTALL)


def visible(element):
    """Return True when *element* is a text node that counts as visible text.

    A node is rejected when its parent's ``name`` is one of ``style``,
    ``script``, ``[document]``, ``head`` or ``title``, or when its string
    form starts with an HTML comment (``<!-- ... -->``), including comments
    that span multiple lines.

    element -- object exposing ``parent.name`` and convertible with ``str()``
               (used as the predicate for ``filter`` in ``getData``).
    """
    if element.parent.name in _INVISIBLE_PARENTS:
        return False
    if _COMMENT_RE.match(str(element)):
        return False
    return True
def getData(source):
    """Download a page and return its visible text, lower-cased.

    source -- sequence of the form ``[name, url, charset]``; only
              ``source[1]`` (the URL to fetch) and ``source[2]`` (the codec
              used to decode the response body) are read here.

    Returns the visible text nodes of the page joined together, with
    ``&nbsp;`` replaced by a space, ``&gt;`` removed, and the result
    lower-cased.

    Errors raised by ``urllib2.urlopen`` or by decoding the body with the
    declared charset are propagated to the caller.
    """
    response = urllib2.urlopen(source[1])
    try:
        raw = response.read()
    finally:
        # Release the connection even when read() fails.
        response.close()

    # Always decode with the charset declared for this source instead of
    # depending on the interpreter's default encoding, then normalise the
    # markup to UTF-8 bytes before parsing.
    html = raw.decode(source[2]).encode("utf-8")

    soup = BeautifulSoup(html)
    text_nodes = soup.findAll(text=True)
    visible_texts = filter(visible, text_nodes)
    clearRaw = ''.join(visible_texts).replace("&nbsp;", " ").replace("&gt;", "").lower()
    return clearRaw
# One timestamp for the whole run, stored as "momentum" on every document
# inserted below so that documents from the same run share the same value.
timestampForProcess = time.time()

client = MongoClient('localhost', 27017)
portalData = client.portalData

# Each entry is [name, url, charset]; the charset is what getData uses to
# decode the downloaded page.
portals = [
    ["terra", "http://terra.com.br/", "utf-8"],
    ["uol", "http://uol.com.br", "utf-8"],
    ["folha", "http://folha.com.br", "cp1252"],
    ["estadao", "http://estadao.com.br", "utf-8"],
]

for portal in portals:
    # Single parenthesised argument: prints the same line on Python 2.
    print("processing " + portal[0] + " from " + portal[1])
    data = getData(portal)
    gram = nltk.word_tokenize(data)
    # Newer NLTK releases return generators from bigrams()/trigrams(), which
    # cannot be stored in a MongoDB document; materialise them as lists.
    bgram = list(nltk.bigrams(gram))
    tgram = list(nltk.trigrams(gram))
    portalData.portal.insert({
        "name": portal[0],
        "momentum": timestampForProcess,
        "source": data,
        "gram": gram,
        "bgram": bgram,
        "tgram": tgram,
    })
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment