Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@neubig
Created June 25, 2019 18:37
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save neubig/52b9354b0dd97d08201fb95851435653 to your computer and use it in GitHub Desktop.
Save neubig/52b9354b0dd97d08201fb95851435653 to your computer and use it in GitHub Desktop.
A script to identify Japanese pronouns
import sys
import re
from collections import defaultdict
# This is a script to identify pronouns in Japanese
# It requires data segmented by KyTea (http://www.phontron.com/kytea/)
#
# If you have raw Japanese text (with no spaces), use this script like:
# cat japanese.txt | kytea | python identify_japanese_pronouns.py > japanese_with_pronouns.txt
#
# If you have tokenized Japanese text (with lots of spaces), use this script like:
# cat segmented_japanese.txt | kytea -in tok | python identify_japanese_pronouns.py > japanese_with_pronouns.txt
#
# The result will be segmented, where each pronoun X will be surrounded by <<<X>>>_{1,2,3} indicating the
# person of the pronoun.
#
# This heavily references Wikipedia:
# https://ja.wikipedia.org/wiki/日本語の一人称代名詞
# https://ja.wikipedia.org/wiki/日本語の二人称代名詞
pronoun_map = {
# "Standard" first person
'私': 1,
'わたし': 1,
'僕': 1,
'ぼく': 1,
'俺': 1,
'おれ': 1,
'我々': 1, # 1st person plural
# Less standard first person
'わたくし': 1,
'あたし': 1,
'あたくし': 1,
'わし': 1,
'わて': 1,
'わい': 1,
'うち': 1,
'おいら': 1,
'おい': 1,
'我輩': 1,
'吾輩': 1,
'我が輩': 1,
'吾が輩': 1,
# "Standard" second person
'あなた': 2,
'あんた': 2,
'お前': 2,
'君': 2,
# "Standard" third person
'彼': 3, # can also mean "boyfriend"
'かれ': 3, # can also mean "boyfriend"
'彼女': 3, # can also mean "girlfriend"
'かのじょ': 3, # can also mean "girlfriend"
'あいつ': 3,
}
missed_pronoun = defaultdict(lambda: 0)
for line in sys.stdin:
line = line.replace('\ ', ' ') # remove space tokens
words = line.strip().split(' ')
out = []
for w in words:
t = w.split('/')
if len(t) != 3:
print(f'malformed tag {w}, skipping', file=sys.stderr)
out.append(w)
elif t[1] == '代名詞':
if t[0] in pronoun_map:
out.append(f'<<<{t[0]}>>>_{pronoun_map[t[0]]}')
else:
missed_pronoun[t[0]] += 1
out.append(t[0])
else:
out.append(t[0])
print(' '.join(out))
# # Print out missed pronouns to check to make sure none are missed
# for k, v in sorted(missed_pronoun.items(), key=lambda x: -x[1]):
# print(f'MISSED: {k}\t{v}', file=sys.stderr)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment