Skip to content

Instantly share code, notes, and snippets.

@controversial
Created January 29, 2016 23:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save controversial/6de2f9c22e3fd4865f06 to your computer and use it in GitHub Desktop.
Save controversial/6de2f9c22e3fd4865f06 to your computer and use it in GitHub Desktop.
Wikipedia Scraper
#Scrapes Wikipedia. Start with the name of a page. Then, it will follow the first
#5 links on this page. For each of these links, it will follow the first 5 links
#on *that* page, and so on. The start page counts as depth 1, and exploration
#stops once depth 5 is reached.
#These attributes can be adjusted by changing BREADTH and MAXDEPTH. This script
#will output a PNG file of your Wikipedia map.
#REQUIREMENTS: `wikipedia` and `pydot` (pydot needs Graphviz installed to write the PNG)
import wikipedia as wp
import pydot
def ascii(inp):
    """Return *inp* with every non-ASCII character dropped, as a native ``str``.

    Works on both Python 2 and Python 3. The result is used as a node name
    for the graph, so it must be plain text rather than a bytes repr.

    NOTE: this name shadows the Python 3 builtin ``ascii``; it is kept
    because other code in this file calls it by this name.

    Args:
        inp: text (or UTF-8 encoded bytes) to sanitise.

    Returns:
        A ``str`` containing only the ASCII characters of *inp*.
    """
    if isinstance(inp, bytes):
        # Byte strings (e.g. Python 2 raw_input results) must be decoded
        # first; calling .encode() on them directly triggers an implicit
        # ASCII decode that raises on any non-ASCII byte.
        inp = inp.decode("utf-8", "ignore")
    encoded = inp.encode("ascii", "ignore")
    if isinstance(encoded, str):
        # Python 2: the encoded value already is the native str type.
        return encoded
    # Python 3: str(bytes) would produce the "b'...'" repr, so decode instead.
    return encoded.decode("ascii")
class WikiScraper:
    """Crawl Wikipedia links outward from a start page and record them as a graph.

    Attributes:
        map: dict mapping each explored page title to the list of link
            titles that were followed from it.
        startpage: title of the page the crawl starts from.
        maxbreadth: maximum number of links taken from each page.
        maxdepth: depth at which exploration stops (the start page is
            depth 1). A value of 0 or less means "no limit".
        graph: ``pydot.Dot`` graph (undirected) with one edge per link.
    """

    def __init__(self, startpage, maxbreadth=10):
        """Set up an empty crawl rooted at *startpage*.

        Args:
            startpage: title of the page to start from.
            maxbreadth: how many links to take from each page.
        """
        self.map = {}
        self.startpage = startpage
        self.maxbreadth = maxbreadth
        self.maxdepth = 0
        self.graph = pydot.Dot(graph_type="graph")

    @staticmethod
    def _printable(text):
        """Return *text* as the interpreter's native ``str`` for printing."""
        if isinstance(text, str):
            return text
        if isinstance(text, bytes):
            # Python 3 bytes: turn into text instead of concatenating bytes.
            return text.decode("utf-8", "replace")
        # Python 2 unicode: encode explicitly so non-ASCII titles print.
        return text.encode("utf-8")

    def connect(self, parent, children):
        """Record *children* as the links of *parent* and add graph edges.

        Args:
            parent: title of the page the links were found on.
            children: list of linked page titles.
        """
        self.map[parent] = children
        for child in children:
            # Node names are reduced to ASCII before being handed to pydot.
            self.graph.add_edge(pydot.Edge(ascii(parent), ascii(child)))

    def explore(self, pagename, depth):
        """Recursively follow links starting at *pagename*.

        Pages that are already in ``self.map``, disambiguation pages and
        missing pages are skipped.

        Args:
            pagename: title of the page to load.
            depth: current depth; the start page is explored at depth 1.
        """
        # Stop at the depth limit. ">=" rather than "==" so that a call which
        # starts beyond the limit also stops; a non-positive (or unset) limit
        # disables the check entirely.
        limit = self.maxdepth or 0
        if limit > 0 and depth >= limit:
            return
        # Skip pages that have already been explored.
        if pagename in self.map:
            return
        try:
            page = wp.page(pagename)
        except wp.exceptions.DisambiguationError:
            # Disambiguation pages are not followed.
            return
        except wp.exceptions.PageError:
            # The requested page does not exist.
            print("The page {} could not be found".format(
                self._printable(pagename)))
            return
        print("Exploring " + self._printable(pagename)
              + " at depth " + str(depth))
        links = page.links[:self.maxbreadth]
        self.connect(pagename, links)
        for link in links:
            self.explore(link, depth + 1)

    def start(self, maxdepth=0):
        """Run the crawl from ``self.startpage``.

        Args:
            maxdepth: depth at which exploration stops. With the default of
                0 there is no depth limit, so the crawl only ends when no
                unexplored links remain or Python's recursion limit is hit.
        """
        self.maxdepth = maxdepth
        self.explore(self.startpage, 1)
if __name__ == "__main__":
    # Script entry point: ask for a start page, crawl it, write "<page>.png".
    # raw_input only exists on Python 2; Python 3 renamed it to input.
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    STARTPAGE = read_line("Name of page to start at: ").strip()
    if not STARTPAGE:
        # A blank title cannot name a page; stop before any request is made.
        raise SystemExit("No start page given")
    BREADTH = 5
    MAXDEPTH = 5
    w = WikiScraper(STARTPAGE, BREADTH)
    w.start(MAXDEPTH)
    # Page titles may contain path separators (e.g. "AC/DC"), which would
    # otherwise be read as directories in the output path.
    OUTFILE = STARTPAGE.replace("/", "_").replace("\\", "_") + ".png"
    w.graph.write_png(OUTFILE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment