Created
August 18, 2016 04:24
-
-
Save hardik-vala/335b7e4f32622a13a95bc33e821c4fe5 to your computer and use it in GitHub Desktop.
Simple script to translate each (English) token in a text file to its WordNet synset (using the NLTK API)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Translates each token in a text file to its WordNet synset. | |
@author: Hardik | |
""" | |
import argparse
import io
import logging
import os
import re
import string

from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
# Root logger setup: emit INFO and above, each record rendered as
# "LEVEL: [timestamp] message".
log_format = "%(levelname)s: [%(asctime)s] %(message)s"
logging.basicConfig(level=logging.INFO, format=log_format)
def main():
    """Translate the tokens of one file, or of every file in a directory.

    Command-line arguments:
        in_path:  path to the input file (or input directory with ``--dir``).
        out_path: path to the output file (or output directory with
            ``--dir``).
        -d/--dir: treat both paths as directories; every non-hidden regular
            file in ``in_path`` is translated to a file of the same name in
            ``out_path``.

    Each whitespace-separated token is lowercased and stripped of ASCII
    punctuation. English stopwords, tokens that become empty, and tokens
    for which WordNet returns no synset are dropped. Every remaining token
    is replaced by the first lemma name of its first synset. The line
    structure of the input is preserved; input is read and output is
    written as UTF-8.
    """
    parser_description = ("Translates each token in a text file to its "
                          "WordNet synset.")
    parser = argparse.ArgumentParser(description=parser_description)
    parser.add_argument('in_path', help="path to input")
    parser.add_argument('out_path', help="path to output")
    parser.add_argument('-d', '--dir', action='store_true',
                        help="input and output correspond to directories")
    args = parser.parse_args()

    stop = set(stopwords.words('english'))
    # Compiled once here rather than re-parsed from a string for every token.
    punc_regex = re.compile('[' + re.escape(string.punctuation) + ']')

    # If run in directory mode, create the output directory if it doesn't
    # already exist.
    if args.dir and not os.path.exists(args.out_path):
        logging.info("Creating directory %s...", args.out_path)
        os.makedirs(args.out_path)

    # Translates a word to its WordNet synset, by outputting the first lemma
    # name of the first synset returned for it (treated here as its most
    # prominent sense). Returns None when WordNet has no synset for the word.
    def translate_word(word):
        synsets = wn.synsets(word)
        if synsets:
            return synsets[0].lemma_names()[0]
        return None

    # Translates a single file. The whole input is read before the output is
    # opened, so in_fpath and out_fpath may refer to the same file.
    def translate_file(in_fpath, out_fpath):
        logging.info("Mapping %s...", in_fpath)

        translated_lines = []
        # Decode explicitly as UTF-8 so reading does not depend on the
        # platform's default encoding (the output is always UTF-8).
        with io.open(in_fpath, encoding='utf-8') as fin:
            for line in fin:
                translated_tokens = []
                for token in line.split():
                    # Lowercase token and strip all punctuation.
                    token = punc_regex.sub("", token.lower())
                    # A token made only of punctuation is empty by now; skip
                    # it (and stopwords) without querying WordNet.
                    if not token or token in stop:
                        continue
                    translated_token = translate_word(token)
                    if translated_token is not None:
                        translated_tokens.append(translated_token)
                translated_lines.append(' '.join(translated_tokens))
        translated_text = '\n'.join(translated_lines)

        logging.info("Done! Saving translation to %s...", out_fpath)
        # Binary mode: the text is encoded explicitly, and handing bytes to a
        # text-mode handle raises TypeError on Python 3.
        with open(out_fpath, 'wb') as fout:
            fout.write(translated_text.encode('utf-8'))

    # Run in directory mode, where input and output paths given correspond to
    # directories.
    if args.dir:
        # Loop through input directory files.
        for fname in os.listdir(args.in_path):
            # Ignore hidden files.
            if fname.startswith('.'):
                continue

            in_fpath = os.path.join(args.in_path, fname)
            # os.listdir also yields subdirectories, which cannot be opened
            # as files; skip them instead of aborting the whole run.
            if not os.path.isfile(in_fpath):
                logging.warning("Skipping %s (not a regular file)...",
                                in_fpath)
                continue

            out_fpath = os.path.join(args.out_path, fname)
            translate_file(in_fpath, out_fpath)
    else:
        translate_file(args.in_path, args.out_path)
# Script entry point: run the command-line interface only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment