Skip to content

Instantly share code, notes, and snippets.

@itsJlot
Created August 1, 2019 00:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save itsJlot/e4edc8f68aa98903b0ba3371400209b0 to your computer and use it in GitHub Desktop.
Save itsJlot/e4edc8f68aa98903b0ba3371400209b0 to your computer and use it in GitHub Desktop.
Wikipedia scraper with Neo4j graph
import requests, bs4 as bs, time as t,re,matplotlib.pyplot as plot,py2neo as neo
# Connect to the local Neo4j instance that stores the crawl graph.
# NOTE(review): the credentials are hard-coded; consider loading them from
# configuration or the environment before sharing or deploying this script.
neograph = neo.Graph("bolt://localhost:7687", auth=("neo4j", "neo4jpass"))
matcher = neo.NodeMatcher(neograph)

# Every walk starts from one shared "Random" article node. Look it up first
# so repeated runs reuse the stored node instead of creating a duplicate.
randomNode = matcher.match("Article", name="Random").first()
if randomNode is None:
    randomNode = neo.Node("Article", name="Random")
    neograph.create(randomNode)
def getBSoupFromLink(link, timeout=10):
    """Download *link* and return its HTML parsed as a BeautifulSoup tree.

    Args:
        link: Absolute URL to fetch.
        timeout: Seconds to wait on the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would
            block the whole crawl indefinitely.

    Returns:
        The response body parsed with the ``html.parser`` backend.
    """
    response = requests.get(link, timeout=timeout)
    return bs.BeautifulSoup(response.text, "html.parser")
# Crawl driver: run 50 walks, each starting at Wikipedia's random-article
# URL and following the first eligible body link of every page for at most
# 300 hops. Each hop is stored in Neo4j as (previous)-[:LinksTo]->(current).

# Candidate hrefs: site-relative paths starting with "/w" that contain no
# ":" (namespaced pages) and no "#" (fragments). Compiled once, not per page.
_ARTICLE_HREF_RE = re.compile(r"^/w[^:#]*$")

for walk in range(50):
    curLink = "/wiki/Special:Random"
    startingTitle = "Random"
    startingNode = randomNode
    for step in range(300):
        bsoup = getBSoupFromLink("https://en.wikipedia.org" + curLink)

        headingTag = bsoup.find("h1", attrs={"id": "firstHeading"})
        if headingTag is None:
            # Not a regular article page; nothing to record for this walk.
            break
        heading = headingTag.text

        # Drop all tables so links inside them are never picked as the
        # "first link" of the article.
        for table in bsoup.find_all("table"):
            table.decompose()

        # Remove the first <div> that carries any class attribute.
        # NOTE(review): the original variable was named "thumbs", so this
        # was presumably meant to strip thumbnail boxes -- confirm intent.
        firstClassedDiv = bsoup.find("div", class_=True)
        if firstClassedDiv is not None:
            firstClassedDiv.decompose()

        mwpo = bsoup.find("div", attrs={"class": "mw-parser-output"})
        if not mwpo:
            break

        # Pick the first link that is not language-related, carries no
        # fragment/query, and whose parent is a plain element (no role or
        # class attribute, not italic text).
        nextLink = None
        for link in mwpo.find_all("a", href=_ARTICLE_HREF_RE):
            href = link["href"]
            if "anguage" in href or re.search(r"[#?]", href):
                continue
            parent = link.parent
            if parent.has_attr("role") or parent.has_attr("class") or parent.name == "i":
                continue
            nextLink = href
            break
        if nextLink is not None:
            curLink = nextLink

        print(startingTitle,"->",heading, "\n", curLink, "\n", "-" * 50)

        # Reuse the stored node for this article if there is one; otherwise
        # create it. Either way, record the hop from the previous article.
        currentNode = matcher.match("Article", name=heading).first()
        alreadyKnown = currentNode is not None
        if not alreadyKnown:
            currentNode = neo.Node("Article", name=heading)
            neograph.create(currentNode)
            print(startingNode, currentNode)
        neograph.create(neo.Relationship(startingNode, "LinksTo", currentNode))
        if alreadyKnown:
            # The walk has merged into an already-recorded article, so this
            # walk ends here.
            break

        startingTitle = heading
        startingNode = currentNode

        if nextLink is None:
            # Dead end: re-fetching the stale curLink would record a
            # self-loop or a false edge to an unrelated random article.
            break
#nx.draw_networkx(graph,font_size = 5,nodesize=50)
#plot.show()
#nx.node_link_data()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment