Skip to content

Instantly share code, notes, and snippets.

@gyu-don
Created June 26, 2016 05:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gyu-don/0f4a99d7b7d569bea0cb34d7ac73fd0c to your computer and use it in GitHub Desktop.
Save gyu-don/0f4a99d7b7d569bea0cb34d7ac73fd0c to your computer and use it in GitHub Desktop.
wp_hops Part-2
import sys
import getpass
import mysql.connector
class WordNotFoundError(Exception):
def __init__(self, word):
self.word = word
def __str__(self):
return self.word + " was not found."
class PageIdNotFoundError(Exception):
def __init__(self, pid):
self.pid = pid
def __str(self):
return self.pid + " was not found."
class LinkNotFoundError(Exception):
def __init__(self, msg):
self.msg = msg
def __str__(self):
return self.msg
def get_pageid(c, w):
c.execute("SELECT page_id FROM page WHERE page_namespace=0 AND page_title=%s", (w,))
result = c.fetchone()
if result is None:
raise WordNotFoundError(w)
return result[0]
def get_title(c, pid):
c.execute("SELECT page_title FROM page WHERE page_id=%s", (pid,))
result = c.fetchone()
if result is None:
raise PageIdNotFoundError(pid)
return result[0].decode("utf-8")
def get_linkfrom(c, w):
c.execute("SELECT pl_from FROM pagelinks WHERE pl_from_namespace=0 AND pl_namespace=0 AND pl_title=%s", (w,))
result = c.fetchall()
if result:
return [t[0] for t in result]
else:
return []
def wp_hops(c, w_from, w_to):
# Raise WordNotFoundError if input w_from, w_to are not in Wikipedia.
get_pageid(c, w_to)
get_pageid(c, w_from)
title_list = [w_to]
links = {}
n_link = 0 # for debug purpose.
while 1:
next_title_list = []
for title in title_list:
print(n_link, title)
c.execute("""SELECT page_title FROM page JOIN pagelinks ON page.page_id=pagelinks.pl_from
WHERE pagelinks.pl_from_namespace=0 AND
pagelinks.pl_namespace=0 AND
pagelinks.pl_title=%s""", (title,))
from_titles = [a[0].decode("utf-8") for a in c.fetchall()]
for ft in from_titles:
if ft == w_from:
result = [w_from, title]
t = title
while t != w_to:
t = links[t]
result.append(t)
return result
if ft not in links:
links[ft] = title
next_title_list.append(ft)
title_list = next_title_list
n_link += 1
if __name__ == "__main__":
if len(sys.argv) == 4:
user = sys.argv[1]
pw = getpass.getpass()
w_from = sys.argv[2]
w_to = sys.argv[3]
elif len(sys.argv) == 5:
user = sys.argv[1]
pw = sys.argv[2]
w_from = sys.argv[3]
w_to = sys.argv[4]
conn = mysql.connector.Connect(user=user, password=pw, db="jawiki", charset="utf8")
c = conn.cursor()
print(wp_hops(c, w_from, w_to))
c.close()
conn.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment