Skip to content

Instantly share code, notes, and snippets.

@itsJlot
Created August 1, 2019 00:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save itsJlot/4429d9505bc35f06e88c4f26d35b6076 to your computer and use it in GitHub Desktop.
Save itsJlot/4429d9505bc35f06e88c4f26d35b6076 to your computer and use it in GitHub Desktop.
Wikipedia scraper with NetworkX graphing; faster to set up than the Neo4j version
import requests, bs4 as bs, time as t,re,networkx as nx,matplotlib.pyplot as plot
# Module-level undirected graph filled in by the crawl loop below: nodes are
# page headings and edges join a page to the next page visited after it.
graph = nx.Graph()
def getBSoupFromLink(link, timeout=10):
    """Download *link* and return its body parsed as a BeautifulSoup tree.

    Args:
        link: Absolute URL to fetch with an HTTP GET request.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the whole
            crawl indefinitely.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    response = requests.get(link, timeout=timeout)
    return bs.BeautifulSoup(response.text, "html.parser")
# Site-relative hrefs that start with "/w" and contain neither ":" nor "#".
# Compiled once because it is reused for every page visited.
_ARTICLE_HREF = re.compile("^/w[^:#]*$")


def _first_article_link(content):
    """Return the href of the first usable link inside *content*, or None.

    A link is skipped when its href contains "anguage", "#" or "?", or when
    its direct parent tag carries a ``role`` or ``class`` attribute or is an
    ``<i>`` element.
    """
    # NOTE(review): the "anguage" test presumably filters links to language
    # articles -- confirm the intent before changing it.
    for anchor in content.find_all("a", href=_ARTICLE_HREF):
        href = anchor["href"]
        if "anguage" in href or re.search("[#?]", href):
            continue
        parent = anchor.parent
        if parent.has_attr("role") or parent.has_attr("class") or parent.name == "i":
            continue
        return href
    return None


# Walk 10 chains. Each chain starts at a random article and follows the first
# usable link of every page for at most 5 hops, recording the path in `graph`.
for _chain in range(10):
    curLink = "/wiki/Special:Random"
    startingTitle = "Random"
    for _hop in range(5):
        bsoup = getBSoupFromLink("https://en.wikipedia.org" + curLink)
        heading_tag = bsoup.find("h1", attrs={"id": "firstHeading"})
        if heading_tag is None:
            # No article heading on this page, so there is nothing to record.
            break
        heading = heading_tag.text

        # Strip every table so that links inside them are never followed.
        for table in bsoup.find_all("table"):
            table.decompose()
        # NOTE(review): this removes the first <div> that has any class
        # attribute; the original variable name ("thumbs") suggests that
        # thumbnails were the target -- confirm.
        first_classed_div = bsoup.find("div", class_=True)
        if first_classed_div is not None:
            first_classed_div.decompose()

        mwpo = bsoup.find("div", attrs={"class": "mw-parser-output"})
        if not mwpo:
            break

        next_link = _first_article_link(mwpo)
        if next_link is not None:
            curLink = next_link
        print(startingTitle, "->", heading, "\n", curLink, "\n", "-" * 50)

        # add_edge creates missing endpoints and is a no-op for an edge that
        # already exists, so it covers all of the node/edge bookkeeping.
        already_seen = graph.has_node(heading)
        graph.add_edge(startingTitle, heading)
        if already_seen:
            # This chain has merged into a path that was already explored.
            print("node " + heading + " already exists")
            break
        if next_link is None:
            # Dead end: stop here instead of re-fetching the stale link,
            # which would add a self-loop or an edge to an unrelated page.
            break
        startingTitle = heading

# `node_size` is the keyword draw_networkx accepts ("nodesize" is rejected).
nx.draw_networkx(graph, font_size=5, node_size=50)
plot.show()
# node_link_data needs the graph to serialise; keep the result available.
graph_data = nx.node_link_data(graph)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment