Created
January 21, 2019 20:18
-
-
Save Khouderchah-Alex/ebfc75ddbac177d0aef189da19b18920 to your computer and use it in GitHub Desktop.
Script to add Chinese words from an input list to a word file when all the characters in that word are known.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Script to add Chinese words from the input list to the word file when all the | |
characters in that word are known (i.e. exist in the character file).
The input list is currently assumed to be the output from the ArchChinese-Scraper
Chrome extension (see https://github.com/Khouderchah-Alex/ArchChinese-Scraper).
Note that in the current form, the character file must contain both traditional
and simplified characters. If one is only studying traditional or simplified
characters, the word_regex variable can be modified such that the word capture
group only contains the traditional/simplified version of the word.
An example use:
`python chinese_word_selector.py < word_list`
""" | |
import argparse
import codecs
import fileinput
import io
import os
import re
# Regular expression whose first capture group is the character recorded on a
# line of the character file, i.e. the first character of that line.
character_regex = r'(.)'

# Regular expression whose first capture group holds the word at the start of
# a line: the word's characters (optionally including any number of '/'),
# matched lazily up to the first space, ';', '(' or '['.
word_regex = r'(.*?)[ ;\(\[]'
def GetOpts(argv=None):
    """Parse the command-line options of the script.

    Args:
      argv: Optional list of argument strings to parse. When None (the
        default) argparse falls back to sys.argv[1:], as before.

    Returns:
      An argparse.Namespace with the attributes `character_filename` and
      `word_filename`. Paths are returned unexpanded ('~' is kept as-is).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    # These are plain single-value options. They used to be declared with
    # nargs='?', which made a bare `--character_filename` (no value) store
    # None instead of being rejected; main() then failed when expanding the
    # path. Without nargs, argparse reports a proper usage error instead.
    parser.add_argument(
        '--character_filename', type=str,
        default='~/Documents/Chinese_Characters.txt',
        help='Input file in which each line has a single character.')
    parser.add_argument(
        '--word_filename', type=str,
        default='~/Documents/Chinese_Words.txt',
        help='Output file in which to add words to.')
    return parser.parse_args(argv)
def _ReadStdinLines():
    """Return all lines of standard input as UTF-8 decoded text.

    Every returned line ends with a newline so that appending it to a file
    never glues it onto the next appended line.
    """
    lines = []
    try:
        # '-' forces stdin: with no arguments fileinput would re-read
        # sys.argv[1:] and try to open the --options as input files.
        # Binary mode lets us decode as UTF-8 regardless of the locale and
        # yields byte strings on both Python 2 and Python 3.
        for raw_line in fileinput.input(files=('-',), mode='rb'):
            if isinstance(raw_line, bytes):
                raw_line = raw_line.decode('utf-8')
            if not raw_line.endswith('\n'):
                raw_line += '\n'
            lines.append(raw_line)
    finally:
        fileinput.close()
    return lines


def _FirstGroup(pattern, line):
    """Return capture group 1 of `pattern` matched at the start of `line`.

    Returns None when the line does not match, so callers can skip it
    instead of failing on a missing match object.
    """
    match = re.match(pattern, line)
    return match.group(1) if match else None


def main():
    """Record the new character(s) and append the newly learnable words.

    Steps, in order:
      1. Read the input list from stdin.
      2. Append the leading character line(s) of the input to the character
         file. A second line is treated as part of the header when the
         second ';'-separated field of the first line contains '(simpl)'.
         NOTE(review): the indentation of the original was lost; this
         nesting is reconstructed from the logic -- confirm against the
         scraper's output format.
      3. Load the known characters (first capture group of
         `character_regex` per line of the character file), plus '/'.
      4. Load the already-known words (first capture group of `word_regex`
         per line of the word file, if that file exists).
      5. Append to the word file every remaining input line whose word is
         not known yet and consists only of known characters.

    Lines that do not match the relevant regex are skipped. Empty input is
    a no-op.
    """
    opts = GetOpts()
    character_filename = os.path.expanduser(opts.character_filename)
    word_filename = os.path.expanduser(opts.word_filename)

    word_lines = _ReadStdinLines()
    if not word_lines:
        # Nothing was piped in: there is no header line to record.
        return

    # Split the input into the character header and the candidate words.
    header_fields = word_lines[0].split(';')
    has_simplified_line = (
        len(header_fields) > 1 and '(simpl)' in header_fields[1])
    header_length = 2 if has_simplified_line else 1
    header_lines = word_lines[:header_length]
    candidate_lines = word_lines[header_length:]

    # Add new characters to the character file.
    with io.open(character_filename, 'a', encoding='utf-8') as character_file:
        character_file.writelines(header_lines)

    # Get known characters (including the ones appended just above).
    characters = set()
    with io.open(character_filename, encoding='utf-8') as character_file:
        for character_line in character_file:
            character = _FirstGroup(character_regex, character_line)
            if character is not None:
                characters.add(character)
    # '/' may appear inside the word capture group, so it must count as known.
    characters.add('/')

    # Get all known words.
    known_words = set()
    if os.path.isfile(word_filename):
        with io.open(word_filename, encoding='utf-8') as word_file:
            for known_line in word_file:
                known_word = _FirstGroup(word_regex, known_line)
                if known_word is not None:
                    known_words.add(known_word)

    # Collect unknown words with all known characters.
    additions = []
    for word_line in candidate_lines:
        word = _FirstGroup(word_regex, word_line)
        if word is None or word in known_words:
            continue
        if set(word).issubset(characters):
            additions.append(word_line)
            # Remember it so a duplicate later in the input is not re-added.
            known_words.add(word)

    # Opened even when there is nothing to add, so the word file is created
    # on first use exactly as before.
    with io.open(word_filename, 'a', encoding='utf-8') as word_file:
        word_file.writelines(additions)
if __name__ == "__main__":
    # Run the script only when executed directly, not when imported.
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment