A more usable, simple, standalone WSGI interface to the Balanced Corpus of Contemporary Written Japanese: http://www.kotonoha.gr.jp/shonagon/ Run the script from a terminal: $ ./get-bccwj-examples.py then view the results in your browser at: localhost:8000. Run the script with '--help' to see additional options. The script will cache your vocabulary search results in a local SQLite database (sentences.db), so repeated lookups are served without re-querying the corpus.
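For example, to serve on a non-default host and port (the '--host' and '--port' flags are defined at the bottom of the script; the exact invocation below is only an illustration):

$ ./get-bccwj-examples.py --host 0.0.0.0 --port 8080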
#!/usr/bin/python
# -*- mode: python; coding: utf-8 -*-
from wsgiref.simple_server import make_server
from cgi import parse_qs, escape
import urllib, urllib2, cookielib, sqlite3, re
html = """
<html>
<head>
<style type="text/css">
/* form css from: http://webdesignerwall.com/tutorials/beautiful-css3-search-form */
.searchform {
display: inline-block;
zoom: 1;
border: solid 1px #d2d2d2;
padding: 3px 5px;
box-shadow: 0 1px 0px rgba(0,0,0,.1);
background: #f1f1f1;
background: -webkit-gradient(linear, left top, left bottom, from(#fff), to(#ededed));
background: -moz-linear-gradient(top, #fff, #ededed);
border-radius: 2em;
filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffff', endColorstr='#ededed'); /* ie7 */
-webkit-border-radius: 2em;
-moz-border-radius: 2em;
-webkit-box-shadow: 0 1px 0px rgba(0,0,0,.1);
-moz-box-shadow: 0 1px 0px rgba(0,0,0,.1);
-ms-filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffff', endColorstr='#ededed'); /* ie8 */
*display: inline;
}
.searchform input {
font: normal 12px Arial, Helvetica, sans-serif;
}
.searchform .searchfield {
background: #fff;
padding: 6px 6px 6px 8px;
width: 202px;
border: solid 1px #bcbbbb;
outline: none;
-webkit-border-radius: 2em;
-moz-border-radius: 2em;
border-radius: 2em;
-moz-box-shadow: inset 0 1px 2px rgba(0,0,0,.2);
-webkit-box-shadow: inset 0 1px 2px rgba(0,0,0,.2);
box-shadow: inset 0 1px 2px rgba(0,0,0,.2);
}
.searchform .searchbutton {
color: #fff;
border: solid 1px #494949;
font-size: 11px;
height: 27px;
width: 57px;
text-shadow: 0 1px 1px rgba(0,0,0,.6);
-webkit-border-radius: 2em;
-moz-border-radius: 2em;
border-radius: 2em;
background: #5f5f5f;
background: -webkit-gradient(linear, left top, left bottom, from(#9e9e9e), to(#454545));
background: -moz-linear-gradient(top, #9e9e9e, #454545);
filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#9e9e9e', endColorstr='#454545'); /* ie7 */
-ms-filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#9e9e9e', endColorstr='#454545'); /* ie8 */
}
td.cell01 { width: 250px; border-bottom: 2px solid green; }
td.cell02 { width: 100px; font-weight: bold; color: navy; text-align: center; border-bottom: 2px solid green; }
td.cell03 { width: 250px; border-bottom: 2px solid green; }
</style>
</head>
<body>
<div id="searchdiv" style="padding-left: 30px; padding-top: 20px;">
<p>Enter a Japanese word that you would like to find example sentences for.</p>
<div style="font-size: .8em;">
<p>Intended for study purposes and individual use only. <br />
See the official <a href="http://www.kotonoha.gr.jp/shonagon/">BCCWJ</a> website for details.</p>
</div>
<form class="searchform" method="get" action="parsing_get.wsgi">
<input
type="text" name="tango" class="searchfield"
value=" 単語を入力して下さい"
onfocus="if(this.value==this.defaultValue)this.value='';"
onblur="if(this.value=='')this.value=this.defaultValue;" >
<input type="submit" value="検索する" class="searchbutton">
</form>
</div>
<div style="padding-left: 20px;">
%s
</div>
</body>
</html>"""
table_header = """<table>
<tr>
<th class="sample cell01 reversetext">前文脈</th>
<th class="sample cell02 nosort">検索文字列</th>
<th class="sample cell03 text">後文脈</th>
</tr>"""
def retrieve_response( tango ):
    """
    Retrieve the table of example sentences for the
    desired vocabulary item.  Searches and scrapes the
    results from the
    Balanced Corpus of Contemporary Written Japanese:
      http://www.kotonoha.gr.jp/shonagon
    """
    #Set up an opener that keeps the session cookie and looks
    # like an ordinary browser request
    cookie_jar = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
    opener.addheaders.append(('User-agent', 'Mozilla/5.0'))
    opener.addheaders.append(('Referer', 'http://www.kotonoha.gr.jp/shonagon/search_form'))
    urllib2.install_opener(opener)
    ## acquire cookie
    url_1 = 'http://www.kotonoha.gr.jp/shonagon/search_form'
    req = urllib2.Request(url_1)
    rsp = urllib2.urlopen(req)
    # do POST
    url_2 = 'http://www.kotonoha.gr.jp/shonagon/search_result'
    # skip '書籍' and '韻文'
    types = [
        '雑誌', '新聞', '白書',
        '教科書', '広報紙', 'Yahoo!知恵袋',
        'Yahoo!ブログ', '法律', '国会会議録'
    ]
    etypes = [ urllib.quote_plus(x) for x in types ]
    values = dict(
        query_string=tango,
        lcontext_regex='',
        rcontext_regex='',
        entire_period='1',
    )
    data = urllib.urlencode(values)
    for t in etypes:
        data += "&media=" + t
    req = urllib2.Request(url_2, data)
    rsp = urllib2.urlopen(req)
    content = rsp.read()
    #Scrape the result table: each example sentence spans three
    # consecutive 'cell0*' table cells (left context, hit, right context)
    table = []
    entry = []
    for line in content.split("\n"):
        if "<th" in line:
            continue
        if "cell0" in line:
            line = re.sub(r"<td[^>]*>", "", line)
            line = line.replace("</td>", "\t")
            line = line.strip().decode("utf8")
            entry.append(line)
            if len(entry) == 3:
                table.append([entry[1], entry[0], entry[2]])
                entry = []
    return table
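#A hypothetical interactive check of the scraper alone (not required by the
# app itself); assumes the Shonagon service is reachable and that the word
# actually has hits in the corpus:
#   >>> rows = retrieve_response("勉強")
#   >>> rows[0]   # -> [検索文字列, 前文脈, 後文脈] as a list of unicode strings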
def fetch_result( tango ):
    #Open the DB or create it if it doesn't exist
    conn = sqlite3.connect("sentences.db")
    cursor = conn.cursor()
    cursor.execute("""CREATE TABLE IF NOT EXISTS sentences (tango text, lc text, rc text)""")
    #Check whether we already have the entries for this word
    # stored in the local DB
    sql = "SELECT * FROM sentences WHERE tango=?"
    cursor.execute(sql, [tango.decode("utf8")])
    sentences = cursor.fetchall()
    if len(sentences) > 0:
        return sentences
    else:
        #Didn't find anything.  Try and fetch from online
        sentences = retrieve_response( tango )
        #Finally insert whatever we found online into the local
        # DB so we don't have to waste time and resources fetching
        # it again later on during review.  Use a parameterized
        # query so quotes in the scraped text cannot break the
        # INSERT statement.
        for sent in sentences:
            sql = "INSERT INTO sentences VALUES (?, ?, ?)"
            cursor.execute(sql, (tango.decode("utf8"), sent[1], sent[2]))
        conn.commit( )
        return sentences
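#Sketch of the caching behaviour (hypothetical session, same names as above):
#   >>> fetch_result("勉強")   # first call scrapes Shonagon and fills sentences.db
#   >>> fetch_result("勉強")   # later calls are answered from the local SQLite cache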
def application(environ, start_response):
    """
    Purloined from:
      http://webpython.codepoint.net/wsgi_request_parsing_get
    and modified for the purpose of this app.
    """
    # Returns a dictionary containing lists as values.
    d = parse_qs(environ['QUERY_STRING'])
    # In this idiom you must issue a list containing a default value.
    tango = d.get('tango', [''])[0]
    table = table_header
    if tango:
        sentences = fetch_result( tango )
        for sent in sentences:
            row = "<tr><td class='cell01'>"+sent[1].encode("utf8")+\
                "</td><td class='cell02'>"+sent[0].encode("utf8")+\
                "</td><td class='cell03'>"+sent[2].encode("utf8")+"</td></tr>"
            table = table + row
    table = table + "</table>"
    response_body = html % ( table )
    status = '200 OK'
    # Now content type is text/html
    response_headers = [('Content-Type', 'text/html'),
                        ('Content-Length', str(len(response_body)))]
    start_response(status, response_headers)
    return [response_body]
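#The search form issues a GET request of the form (illustrative URL; the path
# is ignored because this server routes every request to 'application' and
# only the 'tango' query parameter is read):
#   http://localhost:8000/parsing_get.wsgi?tango=%E5%8B%89%E5%BC%B7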
if __name__=="__main__":
    import sys, argparse
    parser = argparse.ArgumentParser(
        description="""Simple standalone WSGI script for
        retrieving example sentences from the Balanced Corpus of
        Contemporary Written Japanese.""" )
    parser.add_argument('--host', '-i', help="Host name to use. Defaults to 'localhost'.", default="localhost" )
    parser.add_argument('--port', '-p', help="Port to use. Defaults to '8000'.", default=8000, type=int )
    parser.add_argument('--verbose', '-v', help="Verbose mode.", default=False, action="store_true" )
    args = parser.parse_args()
    if args.verbose==True:
        for k, v in args.__dict__.items():
            print k, "=", v
    #Serve on the requested host/port rather than hard-coded values
    httpd = make_server(args.host, args.port, application)
    httpd.serve_forever( )