Khouderchah-Alex/chinese_word_selector.py

## chinese_word_selector.py

"""Script to add Chinese words from the input list to the word file when all the
characters in that word are known (i.e. exist in the character file).

The input list is currently assumed to be the output from the ArchChinese-Scraper
Chrome extension (see https://github.com/Khouderchah-Alex/ArchChinese-Scraper).

Note that in the current form, the character file must contain both traditional
and simplified characters. If one is only studying traditional or simplified
characters, the word_regex variable can be modified such that the word capture
group only contains the traditional/simplified version of the word.

An example use:
   `python chinese_word_selector.py < word_list`
"""

import argparse
import codecs
import fileinput
import os
import re


# Pattern applied with re.match to each line of the character file. Its first
# (and only) capture group is the first character of that line.
character_regex = r'(.)'
# Pattern applied with re.match to each word line. Its first capture group
# lazily captures everything before the first space, ';', '(' or '[': the
# word's characters, optionally including any number of '/' (presumably the
# separator between the traditional and simplified forms -- confirm against
# the scraper's output format).
word_regex = r'(.*?)[ ;\(\[]'


def GetOpts():
  """Parses the command-line flags that name the character and word files.

  Returns:
    The argparse namespace with `character_filename` and `word_filename`
    attributes. Paths are returned exactly as given (no '~' expansion); both
    default to files under ~/Documents.
  """
  # (flag, default path, help text) for every supported option.
  flag_specs = (
      ('--character_filename',
       '~/Documents/Chinese_Characters.txt',
       'Input file in which each line has a single character.'),
      ('--word_filename',
       '~/Documents/Chinese_Words.txt',
       'Output file in which to add words to.'),
  )

  arg_parser = argparse.ArgumentParser(description=__doc__)
  for flag, default_path, help_text in flag_specs:
    arg_parser.add_argument(flag, type=str, nargs='?', default=default_path,
                            help=help_text)
  return arg_parser.parse_args()


def _FirstGroup(pattern, line):
  """Returns the first capture group of `pattern` matched at the start of `line`.

  Returns None instead of raising when the line does not match (for example a
  blank line), so that callers can skip malformed lines.
  """
  match = re.match(pattern, line)
  return match.group(1) if match else None


def _ReadFirstGroups(filename, pattern):
  """Returns the set of first capture groups of `pattern` over a UTF-8 file.

  Lines that do not match `pattern` are ignored.
  """
  found = set()
  # Binary mode plus an explicit decode behaves the same on Python 2 and 3.
  with open(filename, 'rb') as raw_lines:
    for raw_line in raw_lines:
      value = _FirstGroup(pattern, raw_line.decode('utf-8'))
      if value is not None:
        found.add(value)
  return found


def main():
  """Appends the newly learnable words read from stdin to the word file.

  The first input line (plus the second one when the first line's second
  ';'-separated field contains '(simpl)') is appended to the character file.
  Every remaining input line is appended to the word file when its word, as
  captured by `word_regex`, consists only of characters found in the character
  file (or '/') and is not already present in the word file.

  Empty input is a no-op; input lines from which no word can be extracted are
  skipped.
  """
  opts = GetOpts()

  # Read stdin explicitly ('-'): with no arguments fileinput would treat the
  # command-line flags parsed by GetOpts() as names of input files. Bytes are
  # requested so the decoding is always UTF-8; text lines are accepted as-is in
  # case the interpreter ignores the mode for stdin.
  word_lines = [
      line.decode('utf-8') if isinstance(line, bytes) else line
      for line in fileinput.input(files=('-',), mode='rb')]
  if not word_lines:
    return

  # The input starts with one character line, or two when the first one is
  # marked '(simpl)'. Those lines go to the character file, not the word list.
  character_lines = [word_lines.pop(0)]
  fields = character_lines[0].split(';')
  if len(fields) > 1 and '(simpl)' in fields[1] and word_lines:
    character_lines.append(word_lines.pop(0))

  character_filename = os.path.expanduser(opts.character_filename)
  with open(character_filename, 'ab') as character_file:
    for character_line in character_lines:
      character_file.write(character_line.encode('utf-8'))

  # Known characters are read after the append above so that the characters
  # just added count as known.
  characters = _ReadFirstGroups(character_filename, character_regex)
  characters.add('/')

  # Words already present in the word file (which may not exist yet).
  word_filename = os.path.expanduser(opts.word_filename)
  known_words = set()
  if os.path.isfile(word_filename):
    known_words = _ReadFirstGroups(word_filename, word_regex)

  # Collect unknown words made up entirely of known characters.
  add_lines = []
  for word_line in word_lines:
    word = _FirstGroup(word_regex, word_line)
    if word is None:
      continue  # No delimiter on this line, so there is no word to extract.
    if set(word).issubset(characters) and word not in known_words:
      add_lines.append(word_line)

  with open(word_filename, 'ab') as word_file:
    for addition in add_lines:
      word_file.write(addition.encode('utf-8'))


# Run the selector only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
  main()

	"""Script to add Chinese words from the input list to the word file when all the
	characters in that word are known (i.e. exist in the character file).

	The input list is currently assumed to be the output from the ArchChinese-Scraper
	Chrome extension (see https://github.com/Khouderchah-Alex/ArchChinese-Scraper).

	Note that in the current form, the character file must contain both traditional
	and simplified characters. If one is only studying traditional or simplified
	characters, the word_regex variable can be modified such that the word capture
	group only contains the traditional/simplified version of the word.

	An example use:
	`python chinese_word_selector.py < word_list`
	"""

	import argparse
	import codecs
	import fileinput
	import os
	import re


	# Pattern applied with re.match to each line of the character file. Its first
	# (and only) capture group is the first character of that line.
	character_regex = r'(.)'
	# Pattern applied with re.match to each word line. Its first capture group
	# lazily captures everything before the first space, ';', '(' or '[': the
	# word's characters, optionally including any number of '/' (presumably the
	# separator between the traditional and simplified forms -- confirm against
	# the scraper's output format).
	word_regex = r'(.*?)[ ;\(\[]'


	def GetOpts():
		"""Parses the command-line flags that name the character and word files.

		Returns:
			The argparse namespace with `character_filename` and `word_filename`
			attributes. Paths are returned exactly as given (no '~' expansion);
			both default to files under ~/Documents.
		"""
		parser = argparse.ArgumentParser(description=__doc__)
		parser.add_argument(
			'--character_filename', type=str, nargs='?',
			default='~/Documents/Chinese_Characters.txt',
			help='Input file in which each line has a single character.')
		parser.add_argument(
			'--word_filename', type=str, nargs='?',
			default='~/Documents/Chinese_Words.txt',
			help='Output file in which to add words to.')
		return parser.parse_args()


	def _FirstGroup(pattern, line):
		"""Returns the first capture group of `pattern` matched at the start of `line`.

		Returns None instead of raising when the line does not match (for example
		a blank line), so that callers can skip malformed lines.
		"""
		match = re.match(pattern, line)
		return match.group(1) if match else None


	def _ReadFirstGroups(filename, pattern):
		"""Returns the set of first capture groups of `pattern` over a UTF-8 file.

		Lines that do not match `pattern` are ignored.
		"""
		found = set()
		# Binary mode plus an explicit decode behaves the same on Python 2 and 3.
		with open(filename, 'rb') as raw_lines:
			for raw_line in raw_lines:
				value = _FirstGroup(pattern, raw_line.decode('utf-8'))
				if value is not None:
					found.add(value)
		return found


	def main():
		"""Appends the newly learnable words read from stdin to the word file.

		The first input line (plus the second one when the first line's second
		';'-separated field contains '(simpl)') is appended to the character file.
		Every remaining input line is appended to the word file when its word, as
		captured by `word_regex`, consists only of characters found in the
		character file (or '/') and is not already present in the word file.

		Empty input is a no-op; input lines from which no word can be extracted
		are skipped.
		"""
		opts = GetOpts()

		# Read stdin explicitly ('-'): with no arguments fileinput would treat the
		# command-line flags parsed by GetOpts() as names of input files. Bytes are
		# requested so the decoding is always UTF-8; text lines are accepted as-is
		# in case the interpreter ignores the mode for stdin.
		word_lines = [
			line.decode('utf-8') if isinstance(line, bytes) else line
			for line in fileinput.input(files=('-',), mode='rb')]
		if not word_lines:
			return

		# The input starts with one character line, or two when the first one is
		# marked '(simpl)'. Those lines go to the character file, not the word list.
		character_lines = [word_lines.pop(0)]
		fields = character_lines[0].split(';')
		if len(fields) > 1 and '(simpl)' in fields[1] and word_lines:
			character_lines.append(word_lines.pop(0))

		character_filename = os.path.expanduser(opts.character_filename)
		with open(character_filename, 'ab') as character_file:
			for character_line in character_lines:
				character_file.write(character_line.encode('utf-8'))

		# Known characters are read after the append above so that the characters
		# just added count as known.
		characters = _ReadFirstGroups(character_filename, character_regex)
		characters.add('/')

		# Words already present in the word file (which may not exist yet).
		word_filename = os.path.expanduser(opts.word_filename)
		known_words = set()
		if os.path.isfile(word_filename):
			known_words = _ReadFirstGroups(word_filename, word_regex)

		# Collect unknown words made up entirely of known characters.
		add_lines = []
		for word_line in word_lines:
			word = _FirstGroup(word_regex, word_line)
			if word is None:
				continue  # No delimiter on this line, so there is no word to extract.
			if set(word).issubset(characters) and word not in known_words:
				add_lines.append(word_line)

		with open(word_filename, 'ab') as word_file:
			for addition in add_lines:
				word_file.write(addition.encode('utf-8'))


	# Run the selector only when this file is executed as a script, not when it
	# is imported as a module.
	if __name__ == "__main__":
		main()