Skip to content

Instantly share code, notes, and snippets.

@rafapolo
Created October 8, 2010 17:38
Show Gist options
  • Save rafapolo/617179 to your computer and use it in GitHub Desktop.
Save rafapolo/617179 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# author: rafael polo
# URL of the printable rendering of the Wikipedia article that the script
# downloads; its first paragraph is what gets turned into a word graph.
WIKI_PAGE = 'http://pt.wikipedia.org/w/index.php?title=Fisica&printable=yes'
# Progress banner, printed before the library imports run.
print "============"
print "Carregando libs..."
import urllib2, re, htmlentitydefs
try:
import matplotlib.pyplot as plt
except:
raise
import networkx as nx
def get_page(url):
    """Download *url* and return the raw response body.

    The request is sent with a ``Mozilla/5.0`` User-agent header instead of
    urllib2's default one (presumably because the server rejects the default
    agent -- confirm before removing it).

    Any error raised while opening or reading the URL propagates to the
    caller unchanged.
    """
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    # Single-argument print(...) outputs the same text under Python 2's
    # print statement.
    print("Carregando " + url)
    infile = opener.open(url)
    try:
        return infile.read()
    finally:
        # Release the connection even when read() fails part-way through.
        infile.close()
def get_first_paragraph(page):
    """Return the first ``<p>...</p>`` paragraph of *page* as plain text.

    *page* is the UTF-8 encoded HTML source. The first matching paragraph is
    decoded, lower-cased, stripped of HTML tags and has its HTML entities
    replaced by the characters they stand for.

    Raises:
        IndexError: if *page* contains no matching paragraph.
    """
    # Everything between a bare <p> and the next </p>.  The pattern is used
    # without re.DOTALL, so a paragraph that spans several lines is not found.
    ps = re.findall(r'<p>\s*(.+?)\s*</p>', page)
    print("Encontrados %s parágrafos." % len(ps))
    if not ps:
        # Same exception type an unguarded ps[0] would raise, but with a
        # message that says what actually went wrong.
        raise IndexError("no <p>...</p> paragraph found in page")
    primeiro = ps[0].decode("utf-8").lower()
    return decode_htmlentities(remove_html_tags(primeiro))
def remove_html_tags(txt):
    """Return *txt* with every HTML tag removed.

    A tag is any ``<...>`` span that contains no further ``<``; the text
    between tags is kept as it is.
    """
    return re.sub(r'<[^<]*?/?>', '', txt)
def substitute_entity(match):
    """Return the character an HTML entity match stands for.

    *match* must come from the pattern ``&(#?)(x?)(\\d{1,5}|\\w{1,8});``:
    group 1 is ``#`` for numeric references, group 2 is an optional ``x`` and
    group 3 is the remaining digits or name.

    Decimal (``&#65;``), hexadecimal (``&#x41;``) and named (``&amp;``)
    entities are converted. Anything that cannot be converted -- an unknown
    name, a malformed number, a code point outside the supported range -- is
    returned unchanged.
    """
    # Function-scope lookups so the block works on Python 2 and Python 3
    # without relying on a module-level alias for the entity table.
    try:
        from htmlentitydefs import name2codepoint
    except ImportError:  # Python 3
        from html.entities import name2codepoint
    try:
        to_char = unichr
    except NameError:  # Python 3
        to_char = chr

    ent = match.group(3)
    if match.group(1) == "#":
        try:
            if match.group(2) == 'x':
                return to_char(int('0x' + ent, 16))
            return to_char(int(ent))
        except (ValueError, OverflowError):
            # Not a valid number / code point: leave the text as it was.
            return match.group()

    # The pattern's optional "x" group swallows the first letter of named
    # entities that start with "x" (e.g. "&xi;"), so put it back before the
    # table lookup.
    cp = name2codepoint.get(match.group(2) + ent)
    if cp:
        return to_char(cp)
    return match.group()
def decode_htmlentities(string):
    """Return *string* with its HTML entities replaced.

    Every ``&name;``, ``&#NNN;`` or ``&#xHH;`` occurrence is handed to
    ``substitute_entity``, whose return value takes its place.
    """
    return re.sub(r'&(#?)(x?)(\d{1,5}|\w{1,8});', substitute_entity, string)
# Download the article and reduce it to the text of its first paragraph.
page = get_page(WIKI_PAGE)
p = get_first_paragraph(page)

# Collapse each run of whitespace into a single space, then drop commas,
# semicolons and full stops.
pattern = re.compile(r'\s+')
limpo = pattern.sub(' ', p).replace(',', '').replace(';', '').replace('.', '')

# Split into words on single spaces.  Empty strings can be left over (for
# example where removed punctuation stood between two spaces) and are
# included in the printed total.
palavras = limpo.split(" ")
count = len(palavras)
print("%s palavras." % count)
print("Gerando Grafo...")
# Build an undirected graph with one node per distinct word and an edge
# between every pair of consecutive words.
G = nx.Graph()
# Occurrences per word.  Kept in a plain dict so nothing has to read node
# attributes back through ``G.node``, which newer networkx releases no longer
# provide.
pesos = {}
palavra_anterior = ""
for palavra in palavras:
    if palavra == "":
        # Left-over empty token from splitting on single spaces.
        continue
    peso = pesos.get(palavra, 0) + 1
    pesos[palavra] = peso
    # add_node on an existing node only updates its attributes, so this both
    # creates new nodes and refreshes the weight/size of repeated words.
    G.add_node(palavra, peso=peso, size=peso * 200)
    if palavra_anterior != "":
        G.add_edge(palavra_anterior, palavra)
    palavra_anterior = palavra

print("Renderizando...")
# Node area grows with the number of occurrences (200 per occurrence).
nx.draw(G, node_size=[pesos[n] * 200 for n in G.nodes()])
print("Ok")
print("============")
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment