Last active
February 18, 2018 23:38
-
-
Save Mindful/0c8f8f76c0677a1b8fb1541a8c7d903d to your computer and use it in GitHub Desktop.
Katakana word frequency
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
# Used for https://japanese.meta.stackexchange.com/questions/1778/list-of-katakana-words-by-frequency/1809
# Intended to filter this data: http://pj.ninjal.ac.jp/corpus_center/bccwj/en/freq-list.html
class Word:
    """One row of the word list, with the fields used for filtering pulled out.

    The values are stored exactly as they appear in the row; nothing is
    converted or validated.
    """

    def __init__(self, row):
        # Keep the untouched row so it can be written back out verbatim.
        self.row = row
        # Column 0 is exposed as the frequency, column 2 as the word text.
        self.frequency, self.text = row[0], row[2]
# Characters inside the Katakana block that are marks rather than kana.
# Chart: http://www.unicode.org/charts/PDF/U30A0.pdf
katakana_punctuation = {
    '゠',  # U+30A0
    '・',  # U+30FB
    'ー',  # U+30FC
}
def char_is_katakana_exclude_punctuation(char):
    """Return True if *char* is in the Katakana block and is not one of the
    marks listed in ``katakana_punctuation``.
    """
    if not char_is_katakana(char):
        return False
    return char not in katakana_punctuation
def char_is_katakana(char):
    """Return True if *char* lies in the Unicode Katakana block (U+30A0-U+30FF).

    Every code point in that range counts, including the block's punctuation
    marks. *char* must be a single character, because it is passed to ``ord``.
    """
    # Hex literals instead of int('30A0', 16): the bounds are constants, so
    # there is no reason to re-parse them from strings on every call.
    return 0x30A0 <= ord(char) <= 0x30FF
def contains_katakana(string):
    """Return True if at least one character of *string* is a katakana
    character other than the block's punctuation marks.

    An empty string yields False.
    """
    return any(char_is_katakana_exclude_punctuation(ch) for ch in string)
def entirely_katakana(string):
    """Return True if *string* is written purely in katakana.

    Every character must fall in the Katakana block; the block's punctuation
    marks (e.g. the long-vowel mark) are allowed inside a word. In addition,
    at least one character must be a real kana, so the empty string and
    strings made only of punctuation marks are rejected. This keeps the
    function consistent with ``contains_katakana``: anything that is entirely
    katakana also contains katakana.
    """
    # contains_katakana() is False for '' and for punctuation-only strings,
    # which a bare "all characters are in the block" check would accept.
    return contains_katakana(string) and all(
        char_is_katakana(char) for char in string
    )
def main(input_path='full_jp_wordlist.tsv',
         all_katakana_path='all_katakana.tsv',
         has_katakana_path='has_katakana.tsv'):
    """Split a tab-separated word list into two katakana subsets.

    The first row of ``input_path`` is copied to both outputs unchanged.
    Each following row is classified by its text column (see ``Word``):

    * entirely katakana -> written to both output files;
    * contains some katakana -> written to ``has_katakana_path`` only;
    * otherwise -> dropped.

    Blank rows in the input are skipped. Both output files are overwritten.

    Args:
        input_path: Tab-separated word list to read.
        all_katakana_path: Output file for rows whose text is all katakana.
        has_katakana_path: Output file for rows whose text has any katakana.
    """
    has_katakana_list = []
    all_katakana_list = []

    # NOTE(review): UTF-8 is assumed for the word list; without an explicit
    # encoding the result depends on the platform locale. Confirm against the
    # actual data file.
    with open(input_path, newline='', encoding='utf-8') as csvfile:
        wordlist_reader = csv.reader(csvfile, delimiter='\t')

        # The first row is passed through to both outputs as-is.
        header = next(wordlist_reader, None)
        if header is not None:
            has_katakana_list.append(header)
            all_katakana_list.append(header)

        for row in wordlist_reader:
            if not row:
                # csv.reader yields [] for a blank line; there is no text
                # column to inspect, so skip it rather than fail on row[0].
                continue
            word = Word(row)
            if entirely_katakana(word.text):
                all_katakana_list.append(word.row)
                has_katakana_list.append(word.row)
            elif contains_katakana(word.text):
                has_katakana_list.append(word.row)

    outputs = (
        (all_katakana_path, all_katakana_list),
        (has_katakana_path, has_katakana_list),
    )
    for path, rows in outputs:
        # newline='' lets the csv writer control line endings itself;
        # otherwise platforms that translate '\n' emit '\r\r\n'.
        with open(path, 'w', newline='', encoding='utf-8') as outfile:
            wr = csv.writer(outfile, quoting=csv.QUOTE_NONE, delimiter='\t')
            wr.writerows(rows)

    print("Finished writing output")
# Run the filter only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment