Skip to content

Instantly share code, notes, and snippets.

@hardik-vala
Created August 18, 2016 04:24
Show Gist options
  • Save hardik-vala/335b7e4f32622a13a95bc33e821c4fe5 to your computer and use it in GitHub Desktop.
Save hardik-vala/335b7e4f32622a13a95bc33e821c4fe5 to your computer and use it in GitHub Desktop.
Simple script to translate each (English) token in a text file to its WordNet synset (using the NLTK API)
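"""
Translates each token in a text file to the first lemma name of its first
WordNet synset (looked up through NLTK). Stopwords, punctuation, and tokens
with no synset are dropped from the output.

@author: Hardik
"""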
import argparse
import logging
import os
import re
import string
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
# Root logger setup: show INFO and above as "LEVEL: [timestamp] message".
log_format = "%(levelname)s: [%(asctime)s] %(message)s"
logging.basicConfig(
    level=logging.INFO,
    format=log_format,
)
def main():
    """Translate a text file, or a directory of files, into WordNet lemmas.

    Command-line arguments:
        in_path: path to the input file (or input directory with -d/--dir).
        out_path: path to the output file (or output directory with
            -d/--dir).
        -d, --dir: treat both paths as directories and translate every
            non-hidden regular file in the input directory into a file of
            the same name in the output directory.

    Input is read and output is written as UTF-8 text.
    """
    parser_description = ("Translates each token in a text file to its "
                          "WordNet synset.")
    parser = argparse.ArgumentParser(description=parser_description)
    parser.add_argument('in_path', help="path to input")
    parser.add_argument('out_path', help="path to output")
    parser.add_argument('-d', '--dir', action='store_true',
                        help="input and output correspond to directories")
    args = parser.parse_args()

    stop = set(stopwords.words('english'))
    # Compiled once; matches any single character of string.punctuation.
    punc_pattern = re.compile('[' + re.escape(string.punctuation) + ']')

    # If run in directory mode, create the output directory if it doesn't
    # already exist.
    if args.dir and not os.path.exists(args.out_path):
        logging.info("Creating directory %s...", args.out_path)
        os.makedirs(args.out_path)

    def translate_word(word):
        """Return the first lemma name of the word's first synset.

        Returns None when WordNet has no synset for the word.
        """
        synsets = wn.synsets(word)
        if synsets:
            return synsets[0].lemma_names()[0]
        return None

    def translate_file(in_fpath, out_fpath):
        """Translate one file line by line and write the result.

        Each whitespace-separated token is lowercased and stripped of
        punctuation; stopwords and tokens without a synset are dropped.
        Output lines correspond one-to-one to input lines and are joined
        with newlines (no trailing newline is added).
        """
        logging.info("Mapping %s...", in_fpath)

        translated_lines = []
        with open(in_fpath, encoding='utf-8') as fin:
            for line in fin:
                translated_tokens = []
                for token in line.split():
                    # Lowercase token and strip all punctuation.
                    token = punc_pattern.sub("", token.lower())

                    # A token made only of punctuation is empty by now.
                    if not token or token in stop:
                        continue

                    translated_token = translate_word(token)
                    if translated_token is not None:
                        translated_tokens.append(translated_token)

                translated_lines.append(' '.join(translated_tokens))

        translated_text = '\n'.join(translated_lines)

        logging.info("Done! Saving translation to %s...", out_fpath)
        # Let the file object do the encoding: writing pre-encoded bytes to
        # a text-mode file raises TypeError on Python 3.
        with open(out_fpath, 'w', encoding='utf-8') as fout:
            fout.write(translated_text)

    # Run in directory mode, where input and output paths given correspond to
    # directories.
    if args.dir:
        # Loop through input directory files.
        for fname in os.listdir(args.in_path):
            # Ignore hidden files.
            if fname.startswith('.'):
                continue

            in_fpath = os.path.join(args.in_path, fname)
            # os.listdir also returns subdirectories, which cannot be opened
            # as text files.
            if not os.path.isfile(in_fpath):
                logging.warning("Skipping %s (not a regular file)", in_fpath)
                continue

            out_fpath = os.path.join(args.out_path, fname)
            translate_file(in_fpath, out_fpath)
    else:
        translate_file(args.in_path, args.out_path)
# Only run the command-line entry point when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment