A more usable, simple, standalone WSGI interface to the Balanced Corpus of Contemporary Written Japanese: http://www.kotonoha.gr.jp/shonagon/ Run the script from a terminal: $ ./get-bccwj-examples.py then view the results in your browser at: localhost:8000. Run the script with '--help' to see additional options. The script will cache your vocabulary search results in a local SQLite database (sentences.db), so repeated lookups are served without re-querying the corpus.
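For example, to serve on a non-default host and port (the '--host' and '--port' flags are defined at the bottom of the script; the exact invocation below is only an illustration):

$ ./get-bccwj-examples.py --host 0.0.0.0 --port 8080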
#!/usr/bin/python
# -*- mode: python; coding: utf-8 -*-
from wsgiref.simple_server import make_server
from cgi import parse_qs, escape
import urllib, urllib2, cookielib, sqlite3, re
html = """
<html>
<head>
<style type="text/css">
/* form css from: http://webdesignerwall.com/tutorials/beautiful-css3-search-form */
.searchform {
display: inline-block;
zoom: 1;
border: solid 1px #d2d2d2;
padding: 3px 5px;
box-shadow: 0 1px 0px rgba(0,0,0,.1);
background: #f1f1f1;
background: -webkit-gradient(linear, left top, left bottom, from(#fff), to(#ededed));
background: -moz-linear-gradient(top, #fff, #ededed);
border-radius: 2em;
filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffff', endColorstr='#ededed'); /* ie7 */
-webkit-border-radius: 2em;
-moz-border-radius: 2em;
-webkit-box-shadow: 0 1px 0px rgba(0,0,0,.1);
-moz-box-shadow: 0 1px 0px rgba(0,0,0,.1);
-ms-filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffff', endColorstr='#ededed'); /* ie8 */
*display: inline;
}
.searchform input {
font: normal 12px Arial, Helvetica, sans-serif;
}
.searchform .searchfield {
background: #fff;
padding: 6px 6px 6px 8px;
width: 202px;
border: solid 1px #bcbbbb;
outline: none;
-webkit-border-radius: 2em;
-moz-border-radius: 2em;
border-radius: 2em;
-moz-box-shadow: inset 0 1px 2px rgba(0,0,0,.2);
-webkit-box-shadow: inset 0 1px 2px rgba(0,0,0,.2);
box-shadow: inset 0 1px 2px rgba(0,0,0,.2);
}
.searchform .searchbutton {
color: #fff;
border: solid 1px #494949;
font-size: 11px;
height: 27px;
width: 57px;
text-shadow: 0 1px 1px rgba(0,0,0,.6);
-webkit-border-radius: 2em;
-moz-border-radius: 2em;
border-radius: 2em;
background: #5f5f5f;
background: -webkit-gradient(linear, left top, left bottom, from(#9e9e9e), to(#454545));
background: -moz-linear-gradient(top, #9e9e9e, #454545);
filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#9e9e9e', endColorstr='#454545'); /* ie7 */
-ms-filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#9e9e9e', endColorstr='#454545'); /* ie8 */
}
td.cell01 { width: 250px; border-bottom: 2px solid green; }
td.cell02 { width: 100px; font-weight: bold; color: navy; text-align: center; border-bottom: 2px solid green; }
td.cell03 { width: 250px; border-bottom: 2px solid green; }
</style>
</head>
<body>
<div id="searchdiv" style="padding-left: 30px; padding-top: 20px;">
<p>Enter a Japanese word that you would like to find example sentences for.</p>
<div style="font-size: .8em;">
<p>Intended for study purposes and individual use only. <br />
See the official <a href="http://www.kotonoha.gr.jp/shonagon/">BCCWJ</a> website for details.</p>
</div>
<form class="searchform" method="get" action="parsing_get.wsgi">
<input
type="text" name="tango" class="searchfield"
value=" 単語を入力して下さい"
onfocus="if(this.value==this.defaultValue)this.value='';"
onblur="if(this.value=='')this.value=this.defaultValue;" >
<input type="submit" value="検索する" class="searchbutton">
</form>
</div>
<div style="padding-left: 20px;">
%s
</div>
</body>
</html>"""
table_header = """<table>
<tr>
<th class="sample cell01 reversetext">前文脈</th>
<th class="sample cell02 nosort">検索文字列</th>
<th class="sample cell03 text">後文脈</th>
</tr>"""
def retrieve_response( tango ):
    """
    Retrieve the table of example sentences for the
    desired vocabulary item.  Searches and scrapes the
    results from the
    Balanced Corpus of Contemporary Written Japanese:
      http://www.kotonoha.gr.jp/shonagon
    """
    #Set up an opener that keeps the session cookie and looks
    # like an ordinary browser request
    cookie_jar = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
    opener.addheaders.append(('User-agent', 'Mozilla/5.0'))
    opener.addheaders.append(('Referer', 'http://www.kotonoha.gr.jp/shonagon/search_form'))
    urllib2.install_opener(opener)
    ## acquire cookie
    url_1 = 'http://www.kotonoha.gr.jp/shonagon/search_form'
    req = urllib2.Request(url_1)
    rsp = urllib2.urlopen(req)
    # do POST
    url_2 = 'http://www.kotonoha.gr.jp/shonagon/search_result'
    # skip '書籍' and '韻文'
    types = [
        '雑誌', '新聞', '白書',
        '教科書', '広報紙', 'Yahoo!知恵袋',
        'Yahoo!ブログ', '法律', '国会会議録'
    ]
    etypes = [ urllib.quote_plus(x) for x in types ]
    values = dict(
        query_string=tango,
        lcontext_regex='',
        rcontext_regex='',
        entire_period='1',
    )
    data = urllib.urlencode(values)
    for t in etypes:
        data += "&media=" + t
    req = urllib2.Request(url_2, data)
    rsp = urllib2.urlopen(req)
    content = rsp.read()
    #Scrape the result table: each example sentence spans three
    # consecutive 'cell0*' table cells (left context, hit, right context)
    table = []
    entry = []
    for line in content.split("\n"):
        if "<th" in line:
            continue
        if "cell0" in line:
            line = re.sub(r"<td[^>]*>", "", line)
            line = line.replace("</td>", "\t")
            line = line.strip().decode("utf8")
            entry.append(line)
            if len(entry) == 3:
                table.append([entry[1], entry[0], entry[2]])
                entry = []
    return table
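#A hypothetical interactive check of the scraper alone (not required by the
# app itself); assumes the Shonagon service is reachable and that the word
# actually has hits in the corpus:
#   >>> rows = retrieve_response("勉強")
#   >>> rows[0]   # -> [検索文字列, 前文脈, 後文脈] as a list of unicode strings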
def fetch_result( tango ):
    #Open the DB or create it if it doesn't exist
    conn = sqlite3.connect("sentences.db")
    cursor = conn.cursor()
    cursor.execute("""CREATE TABLE IF NOT EXISTS sentences (tango text, lc text, rc text)""")
    #Check whether we already have the entries for this word
    # stored in the local DB
    sql = "SELECT * FROM sentences WHERE tango=?"
    cursor.execute(sql, [tango.decode("utf8")])
    sentences = cursor.fetchall()
    if len(sentences) > 0:
        return sentences
    else:
        #Didn't find anything.  Try and fetch from online
        sentences = retrieve_response( tango )
        #Finally insert whatever we found online into the local
        # DB so we don't have to waste time and resources fetching
        # it again later on during review.  Use a parameterized
        # query so quotes in the scraped text cannot break the
        # INSERT statement.
        for sent in sentences:
            sql = "INSERT INTO sentences VALUES (?, ?, ?)"
            cursor.execute(sql, (tango.decode("utf8"), sent[1], sent[2]))
        conn.commit( )
        return sentences
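#Sketch of the caching behaviour (hypothetical session, same names as above):
#   >>> fetch_result("勉強")   # first call scrapes Shonagon and fills sentences.db
#   >>> fetch_result("勉強")   # later calls are answered from the local SQLite cache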
def application(environ, start_response):
    """
    Purloined from:
      http://webpython.codepoint.net/wsgi_request_parsing_get
    and modified for the purpose of this app.
    """
    # Returns a dictionary containing lists as values.
    d = parse_qs(environ['QUERY_STRING'])
    # In this idiom you must issue a list containing a default value.
    tango = d.get('tango', [''])[0]
    table = table_header
    if tango:
        sentences = fetch_result( tango )
        for sent in sentences:
            row = "<tr><td class='cell01'>"+sent[1].encode("utf8")+\
                "</td><td class='cell02'>"+sent[0].encode("utf8")+\
                "</td><td class='cell03'>"+sent[2].encode("utf8")+"</td></tr>"
            table = table + row
    table = table + "</table>"
    response_body = html % ( table )
    status = '200 OK'
    # Now content type is text/html
    response_headers = [('Content-Type', 'text/html'),
                        ('Content-Length', str(len(response_body)))]
    start_response(status, response_headers)
    return [response_body]
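#The search form issues a GET request of the form (illustrative URL; the path
# is ignored because this server routes every request to 'application' and
# only the 'tango' query parameter is read):
#   http://localhost:8000/parsing_get.wsgi?tango=%E5%8B%89%E5%BC%B7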
if __name__=="__main__":
    import sys, argparse
    parser = argparse.ArgumentParser(
        description="""Simple standalone WSGI script for
        retrieving example sentences from the Balanced Corpus of
        Contemporary Written Japanese.""" )
    parser.add_argument('--host', '-i', help="Host name to use. Defaults to 'localhost'.", default="localhost" )
    parser.add_argument('--port', '-p', help="Port to use. Defaults to '8000'.", default=8000, type=int )
    parser.add_argument('--verbose', '-v', help="Verbose mode.", default=False, action="store_true" )
    args = parser.parse_args()
    if args.verbose==True:
        for k, v in args.__dict__.items():
            print k, "=", v
    #Serve on the requested host/port rather than hard-coded values
    httpd = make_server(args.host, args.port, application)
    httpd.serve_forever( )