Skip to content

Instantly share code, notes, and snippets.

@Khouderchah-Alex
Created January 21, 2019 20:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Khouderchah-Alex/ebfc75ddbac177d0aef189da19b18920 to your computer and use it in GitHub Desktop.
Save Khouderchah-Alex/ebfc75ddbac177d0aef189da19b18920 to your computer and use it in GitHub Desktop.
Script to add Chinese words from an input list to a word file when all the characters in that word are known.
"""Script to add Chinese words from the input list to the word file when all the
characters in that word are known (i.e. exist in the character file).
The input list is currently assumed to be the output from the ArchChinese-Scraper
Chrome extension (see https://github.com/Khouderchah-Alex/ArchChinese-Scraper).
Note that in the current form, the character file must contain both traditional
and simplified characters. If one is only studying traditional or simplified
characters, the word_regex variable can be modified such that the word capture
group only contains the traditional/simplified version of the word.
An example use:
`python chinese_word_selector.py < word_list`
"""
import argparse
import codecs
import fileinput
import os
import re
# Regular expression whose first capture group is the character of a line in
# the character file. It is applied with re.match, so the group holds the
# first character on the line (any character except a newline).
character_regex = r'(.)'
# Regular expression whose first capture group contains the word's characters
# and optionally any number of '/'. It is applied with re.match, so the group
# is the shortest run of text from the start of the line up to the first
# space, ';', '(' or '['.
word_regex = r'(.*?)[ ;\(\[]'
def GetOpts():
    """Parse the command-line options of the script.

    Returns:
      The argparse namespace, carrying the `character_filename` and
      `word_filename` attributes.
    """
    # (flag, default path, help text) for every supported option.
    option_specs = (
        ('--character_filename',
         '~/Documents/Chinese_Characters.txt',
         'Input file in which each line has a single character.'),
        ('--word_filename',
         '~/Documents/Chinese_Words.txt',
         'Output file in which to add words to.'),
    )
    arg_parser = argparse.ArgumentParser(description=__doc__)
    for flag, default_path, help_text in option_specs:
        arg_parser.add_argument(
            flag, type=str, nargs='?', default=default_path, help=help_text)
    return arg_parser.parse_args()
def _to_text(line):
    """Return `line` as text, decoding it from UTF-8 when it is a byte string."""
    # Python 2's fileinput yields byte strings, Python 3's yields text.
    return line.decode('utf-8') if isinstance(line, bytes) else line


def _first_group(regex, line):
    """Return the first capture group of `regex` matched at the start of
    `line`, or None when the line does not match."""
    match = re.match(regex, line)
    return match.group(1) if match else None


def main():
    """Record the new character(s) and append every newly readable word.

    The word list is read from standard input. Its first line is appended to
    the character file; when the second ';'-separated field of that line
    contains '(simpl)', the following line is appended as well. Every
    remaining line whose word (first capture group of `word_regex`) consists
    only of known characters, and is not yet in the word file, is appended to
    the word file. Lines from which no word or character can be extracted are
    skipped, and empty input is a no-op.
    """
    opts = GetOpts()
    character_filename = os.path.expanduser(opts.character_filename)
    word_filename = os.path.expanduser(opts.word_filename)

    # Get lines from input. Read stdin explicitly: a bare fileinput.input()
    # would also try to open the command-line options as input files.
    word_lines = [_to_text(line) for line in fileinput.input(files=('-',))]
    if not word_lines:
        return

    # Add new characters to character file. The second line is only taken
    # when the first one announces it and it actually exists.
    fields = word_lines[0].split(';')
    has_simplified = len(fields) > 1 and '(simpl)' in fields[1]
    character_count = 2 if has_simplified and len(word_lines) > 1 else 1
    with codecs.open(character_filename, 'a', encoding='utf-8') as f:
        f.writelines(word_lines[:character_count])
    word_lines = word_lines[character_count:]

    # Get known characters. '/' may appear inside a word, so it always counts
    # as known.
    characters = {'/'}
    with codecs.open(character_filename, encoding='utf-8') as character_file:
        for character_line in character_file:
            character = _first_group(character_regex, character_line)
            if character is not None:
                characters.add(character)

    # Get all known words. Binary mode plus an explicit decode behaves the
    # same on Python 2 and Python 3.
    known_words = set()
    if os.path.isfile(word_filename):
        with open(word_filename, 'rb') as word_list:
            for raw_line in word_list:
                word = _first_group(word_regex, raw_line.decode('utf-8'))
                if word is not None:
                    known_words.add(word)

    # Collect unknown words with all known characters.
    add_lines = []
    for word_line in word_lines:
        word = _first_group(word_regex, word_line)
        if word is None:
            continue  # Blank or malformed line: there is no word to check.
        if word in known_words or not set(word).issubset(characters):
            continue
        add_lines.append(word_line)
        # Remember the word so a repeated input line is not added twice.
        known_words.add(word)

    with codecs.open(word_filename, 'a', encoding='utf-8') as f:
        f.writelines(add_lines)
# Run the selector only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment