Skip to content

Instantly share code, notes, and snippets.

@ikegami-yukino
Last active May 10, 2019 10:45
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ikegami-yukino/2a5ed3fd3ccde2938f020e47a8e4c9af to your computer and use it in GitHub Desktop.
Save ikegami-yukino/2a5ed3fd3ccde2938f020e47a8e4c9af to your computer and use it in GitHub Desktop.
SentiWordNet を日本語化する
import re
import sqlite3
import time
import requests
# SQLite database file queried for the `sense` and `word` tables.
DB_PATH = 'wnjpn.db'
# Tab-separated SentiWordNet input file; lines starting with '#' are skipped.
SWN_PATH = 'SentiWordNet_3.0.0_20130122.txt'
# Translation endpoint template: %s receives the text, translated en -> ja.
# NOTE(review): 'Please_write_here' looks like a placeholder for your own
# script deployment id -- confirm and replace before running.
URL = 'https://script.google.com/macros/s/Please_write_here/exec?text=%s&source=en&target=ja'
# Output file; despite the .csv suffix, rows are written tab-separated.
RESULT_PATH = 'result.csv'
# Captures the text between each pair of double quotes (used on the gloss).
re_sentence = re.compile('"([^"]+)"')
def fetch_jp_lemma(synset, cursor):
    """Return the non-English lemmas attached to a synset.

    Args:
        synset: Synset identifier matched against ``sense.synset``
            (for example ``'00001740-a'``).
        cursor: Open DB-API cursor on a database that has the ``sense``
            and ``word`` tables.

    Returns:
        A list of ``word.lemma`` values whose ``lang`` is not ``'eng'``,
        in the order the rows come back; empty if nothing matches.
    """
    # Bound parameters instead of string interpolation: an identifier that
    # contains a quote can neither break the statement nor inject SQL.
    cursor.execute(
        "SELECT wordid FROM sense WHERE synset = ? AND lang != 'eng'",
        (synset,))
    jp_lemma = []
    # fetchall() materializes the rows up front, so the same cursor can be
    # reused for the per-word lookups inside the loop.
    for (wordid,) in cursor.fetchall():
        cursor.execute(
            "SELECT lemma FROM word WHERE wordid = ? AND lang != 'eng'",
            (wordid,))
        jp_lemma.extend(row[0] for row in cursor.fetchall())
    return jp_lemma
def translate(sentence):
    """Translate one sentence through the web endpoint described by ``URL``.

    Args:
        sentence: Text to send as the ``text`` query parameter.

    Returns:
        The response body decoded as UTF-8.
    """
    # Imported locally so this function does not depend on edits elsewhere.
    from urllib.parse import quote

    # Do not shorten this delay much: the service enforces a usage limit.
    time.sleep(5)
    # Percent-encode the sentence so characters such as '&', '#', '+' or '='
    # stay inside the `text` parameter instead of altering the query string.
    return requests.get(URL % quote(sentence, safe='')).content.decode('utf8')
# Build RESULT_PATH: one tab-separated row per SentiWordNet entry holding the
# synset id, its non-English lemmas (or the original terms when none are
# found), both sentiment scores, and the translated quoted sentences of the
# gloss.
conn = sqlite3.connect(DB_PATH)
try:
    # The output receives translated text, so it is written as UTF-8 rather
    # than whatever the platform's locale encoding happens to be.
    with open(SWN_PATH) as fd, open(RESULT_PATH, 'w', encoding='utf-8') as rfd:
        cursor = conn.cursor()
        # Stream the input line by line instead of loading the whole file.
        for raw_line in fd:
            line = raw_line.rstrip('\r\n')
            # Skip comment lines and blank lines (a blank line would
            # otherwise fail the six-field unpacking below).
            if not line.strip() or line.startswith('#'):
                continue
            pos, offset, pos_score, neg_score, synset_terms, gloss = line.split('\t')
            synset_id = '%s-%s' % (offset, pos)
            jp_lemma = fetch_jp_lemma(synset_id, cursor)
            sentences = [translate(sentence) for sentence in re_sentence.findall(gloss)]
            # Fall back to the original terms when no lemma was found.
            terms = ','.join(jp_lemma) if jp_lemma else synset_terms
            rfd.write('%s\t%s\t%s\t%s\t%s\n' % (
                synset_id, terms, pos_score, neg_score, ','.join(sentences)))
finally:
    # A sqlite3 connection used as a context manager only commits or rolls
    # back; it has to be closed explicitly.
    conn.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment