# hardik-vala/translate_to_eng_synsets.py

## translate_to_eng_synsets.py
"""
Translates each token in a text file to its WordNet synset.

@author: Hardik
"""

import argparse
import logging
import os
import re
import string

from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn


# Root-logger setup: emit "LEVEL: [timestamp] message" for INFO and above.
log_format = "%(levelname)s: [%(asctime)s] %(message)s"
logging.basicConfig(level=logging.INFO, format=log_format)


def main():
	"""
	Command-line entry point: translates each token of the input to the first
	lemma name of its first WordNet synset.

	Reads the positional arguments `in_path` and `out_path` and the optional
	`-d/--dir` flag from the command line. Without `--dir` both paths are
	treated as single files. With `--dir` every entry of the input directory
	whose name does not start with '.' is translated into a same-named file
	in the output directory, which is created first if it does not exist.

	For every input line, tokens are split on whitespace, lowercased and
	stripped of ASCII punctuation. English stopwords, tokens left empty by the
	stripping, and tokens for which WordNet returns no synset are dropped. The
	remaining translations are joined with single spaces, one output line per
	input line, and the result is written UTF-8 encoded.
	"""
	parser_description = ("Translates each token in a text file to its "
		"WordNet synset.")
	parser = argparse.ArgumentParser(description=parser_description)

	parser.add_argument('in_path', help="path to input")
	parser.add_argument('out_path', help="path to output")

	parser.add_argument('-d', '--dir', action='store_true',
		help="input and output correspond to directories")

	args = parser.parse_args()

	stop = set(stopwords.words('english'))

	# Compiled once up front because it is applied to every token of every
	# file.
	punc_regex = re.compile('[' + re.escape(string.punctuation) + ']')

	# If run in directory mode, create the output directory if it doesn't
	# already exist.
	if args.dir and not os.path.exists(args.out_path):
		logging.info("Creating directory %s...", args.out_path)
		os.makedirs(args.out_path)

	# Translates a word to its WordNet synset, by outputting the first lemma
	# name of its first synset. Returns None when the word has no synsets.
	def translate_word(word):
		synsets = wn.synsets(word)
		if synsets:
			return synsets[0].lemma_names()[0]

		return None

	# Translates a single file, writing the result to out_fpath.
	def translate_file(in_fpath, out_fpath):
		logging.info("Mapping %s...", in_fpath)

		translated_lines = []
		with open(in_fpath) as fin:
			for line in fin:
				translated_tokens = []
				for token in line.split():
					# Lowercase token and strip all punctuation.
					token = punc_regex.sub("", token.lower())

					# A token made up only of punctuation is empty at this
					# point; there is nothing left to look up, so skip it
					# along with the stopwords.
					if not token or token in stop:
						continue

					translated_token = translate_word(token)
					if translated_token is not None:
						translated_tokens.append(translated_token)

				# Lines with no surviving tokens are kept as empty lines so
				# the output stays line-aligned with the input.
				translated_lines.append(' '.join(translated_tokens))

		translated_text = '\n'.join(translated_lines)

		logging.info("Done! Saving translation to %s...", out_fpath)

		# Binary mode: the payload is encoded to UTF-8 bytes before writing,
		# and a text-mode handle rejects bytes on Python 3.
		with open(out_fpath, 'wb') as fout:
			fout.write(translated_text.encode('utf-8'))

	# Run in directory mode, where input and output paths given correspond to
	# directories.
	if args.dir:
		# Loop through input directory files.
		for fname in os.listdir(args.in_path):
			# Ignore hidden files.
			if fname.startswith('.'):
				continue

			in_fpath = os.path.join(args.in_path, fname)
			out_fpath = os.path.join(args.out_path, fname)

			translate_file(in_fpath, out_fpath)
	else:
		translate_file(args.in_path, args.out_path)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
	main()