Skip to content

Instantly share code, notes, and snippets.

@urialon
Created April 20, 2020 10:50
Show Gist options
  • Save urialon/7c4d129f6cb45c2cdba669a15f132fb4 to your computer and use it in GitHub Desktop.
Save urialon/7c4d129f6cb45c2cdba669a15f132fb4 to your computer and use it in GitHub Desktop.
from argparse import ArgumentParser
import nltk
# Fetch the NLTK 'punkt' resource when this module is loaded; this runs on
# every import/execution of the file, before any tokenization happens.
# Presumably this is the tokenizer data nltk.word_tokenize (used in
# process_seq) depends on.
# NOTE(review): recent NLTK releases may expect 'punkt_tab' instead - confirm
# against the installed NLTK version.
nltk.download('punkt')
def process_seq(seq):
    """Lower-case ``seq`` and split it into word tokens with NLTK.

    Args:
        seq: Text to tokenize.

    Returns:
        Whatever ``nltk.word_tokenize`` returns for the lower-cased text.
    """
    lowered = seq.lower()
    return nltk.word_tokenize(lowered)
def load_align(align_file):
    """Read an alignment file into a mapping of example id -> target tokens.

    Every non-blank line is split at its first space: the text before it is
    the example id and the remainder is handed to ``process_seq``.

    Args:
        align_file: Path of the alignment file to read.

    Returns:
        A dict mapping each example id to the result of ``process_seq`` for
        that line's target text. If an id occurs more than once, the last
        occurrence wins.
    """
    result = {}
    with open(align_file, 'r') as align_handle:
        for raw_line in align_handle:
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line at end of file)
                # carry no example; previously they raised IndexError.
                continue
            # partition() never raises: a line holding only an id yields an
            # empty target string instead of an IndexError on parts[1].
            example_id, _, target_seq = line.partition(' ')
            result[example_id] = process_seq(target_seq)
    return result
def main():
    """Rewrite each input line so it is prefixed by its aligned target tokens.

    Command-line arguments (all required):
        -in / --input:   path to the input file; each non-blank line is split
                         at its first space into an example id and the rest
                         of the line ("contexts").
        -out / --output: path of the file to write.
        -al / --align:   path of the alignment file read by ``load_align``.

    For every non-blank input line, one output line is written containing
    the example's target tokens joined with '|', a space, and the contexts.

    Raises:
        KeyError: if an input line's example id is absent from the
            alignment file.
    """
    parser = ArgumentParser()
    parser.add_argument("-in", "--input", dest="input_file",
                        help="path to input file", required=True)
    parser.add_argument("-out", "--output", dest="output_file", required=True)
    parser.add_argument("-al", "--align", dest="align_file", required=True)
    args = parser.parse_args()

    # Load the whole alignment table first so each input line is a dict lookup.
    example_to_target = load_align(args.align_file)

    with open(args.input_file, 'r') as infile, \
            open(args.output_file, 'w') as outfile:
        # Iterate the file object directly: streams line by line instead of
        # loading the entire input into memory with readlines().
        for raw_line in infile:
            line = raw_line.strip()
            if not line:
                # Blank lines have no example id; skip them rather than crash.
                continue
            # partition() tolerates a line with an id but no contexts.
            example_id, _, contexts = line.partition(' ')
            target_seq_string = '|'.join(example_to_target[example_id])
            outfile.write('%s %s\n' % (target_seq_string, contexts))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment