Skip to content

Instantly share code, notes, and snippets.

@Mindful
Last active February 18, 2018 23:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Mindful/0c8f8f76c0677a1b8fb1541a8c7d903d to your computer and use it in GitHub Desktop.
Save Mindful/0c8f8f76c0677a1b8fb1541a8c7d903d to your computer and use it in GitHub Desktop.
Katakana word frequency
import csv
#Used for https://japanese.meta.stackexchange.com/questions/1778/list-of-katakana-words-by-frequency/1809
#Intended to filter this data: http://pj.ninjal.ac.jp/corpus_center/bccwj/en/freq-list.html
class Word:
    """One data row of the word list, exposing the columns this script uses.

    The fields are picked out of the parsed TSV row by position: column 0 is
    stored as ``frequency`` and column 2 as ``text``. The complete row is
    kept in ``row`` so it can be written back out unchanged.
    """

    def __init__(self, row):
        """Wrap *row*, an indexable sequence of column values.

        Raises:
            IndexError: if *row* has fewer than three columns.
        """
        self.row = row
        # NOTE(review): the column meanings are taken from the attribute
        # names only; confirm against the header of the input file.
        self.frequency = row[0]
        self.text = row[2]

    def __repr__(self):
        return '{}(frequency={!r}, text={!r})'.format(
            type(self).__name__, self.frequency, self.text)
# Marks from the Katakana code chart that are punctuation rather than kana:
# http://www.unicode.org/charts/PDF/U30A0.pdf
katakana_punctuation = {
    '゠',  # double hyphen
    '・',  # middle dot
    'ー',  # prolonged sound mark
}
def char_is_katakana_exclude_punctuation(char):
    """Return True if *char* is katakana and is not a punctuation mark.

    A character qualifies when ``char_is_katakana`` accepts it and it is not
    a member of the module-level ``katakana_punctuation`` set.

    *char* must be a single-character string.
    """
    return char_is_katakana(char) and char not in katakana_punctuation
def char_is_katakana(char):
    """Return True if *char* has a code point in U+30A0..U+30FF (inclusive).

    The range is tested purely by code point, so the punctuation marks that
    fall inside it are accepted as well.

    Raises:
        TypeError: if *char* is not a string of length one (from ``ord``).
    """
    # Hex literals avoid re-parsing the bounds from strings on every call.
    return 0x30A0 <= ord(char) <= 0x30FF
def contains_katakana(string):
    """Return True if any character of *string* is non-punctuation katakana.

    Each character is tested with ``char_is_katakana_exclude_punctuation``;
    an empty string yields False.
    """
    return any(char_is_katakana_exclude_punctuation(char) for char in string)
def entirely_katakana(string):
    """Return True if *string* is non-empty and every character is katakana.

    Characters are tested with ``char_is_katakana``, so punctuation marks
    inside the katakana range count as katakana here.
    """
    # Without this guard an empty string would pass vacuously and be
    # reported as an all-katakana word.
    if not string:
        return False
    return all(char_is_katakana(char) for char in string)
def _write_tsv(path, rows):
    """Write *rows* to *path* as tab-separated values with quoting disabled."""
    # newline='' leaves line endings to the csv module, as its documentation
    # requires for files passed to csv.writer.
    with open(path, 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file, quoting=csv.QUOTE_NONE, delimiter='\t')
        writer.writerows(rows)


def main():
    """Split the word list into katakana-only and katakana-containing rows.

    Reads ``full_jp_wordlist.tsv`` from the current directory and writes two
    files next to it:

    * ``all_katakana.tsv`` - rows whose text is entirely katakana.
    * ``has_katakana.tsv`` - rows whose text contains any non-punctuation
      katakana, which also includes every row from the first file.

    The first input row is copied to both outputs unchanged.
    """
    has_katakana_list = []
    all_katakana_list = []
    # The encoding is given explicitly so the result does not depend on the
    # platform default. NOTE(review): assumes the input file is UTF-8.
    with open('full_jp_wordlist.tsv', newline='', encoding='utf-8') as csvfile:
        wordlist_reader = csv.reader(csvfile, delimiter='\t')
        first_row = next(wordlist_reader, None)
        if first_row is not None:
            has_katakana_list.append(first_row)
            all_katakana_list.append(first_row)
        for row in wordlist_reader:
            if not row:
                # csv.reader yields [] for a blank line; there is no text
                # column to inspect, so skip it instead of crashing in Word.
                continue
            word = Word(row)
            if entirely_katakana(word.text):
                all_katakana_list.append(word.row)
                has_katakana_list.append(word.row)
            elif contains_katakana(word.text):
                has_katakana_list.append(word.row)
    _write_tsv('all_katakana.tsv', all_katakana_list)
    _write_tsv('has_katakana.tsv', has_katakana_list)
    print("Finished writing output")
if __name__ == "__main__":
    # Run the filter only when executed as a script, not when imported.
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment