Last active
August 29, 2015 14:23
-
-
Save Kaljurand/5fc2adfe03735d6219f0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
from __future__ import division, unicode_literals, print_function | |
import sys | |
import re | |
import argparse | |
from collections import * | |
def is_lemma_form(pos, x):
    """Tell whether analysis ``x`` has part of speech ``pos`` and form ``'sg n'``.

    :param pos: part-of-speech tag compared against ``x['partofspeech']``.
    :param x: mapping that is indexed with ``'partofspeech'`` and ``'form'``.
    :returns: ``True`` when both fields match, ``False`` otherwise.
    :raises KeyError: if a key that gets looked up is missing from ``x``
        (``'form'`` is only looked up when the part of speech matched).
    """
    # NOTE(review): 'sg n' presumably denotes singular nominative -- confirm
    # against the analyzer that produces these mappings.
    return x['partofspeech'] == pos and x['form'] == 'sg n'
# Predicates keyed by part-of-speech tag. Each value takes one analysis
# mapping and defers to is_lemma_form() with the tag it is registered under.
funs_filter = {
    'S': lambda x: is_lemma_form('S', x),
    'A': lambda x: is_lemma_form('A', x),
}
def select_splits(f, a):
    """Collect the ``'lemma_tokens'`` of the analyses of ``a[0]`` accepted by ``f``.

    Only the first element of ``a`` is inspected.

    :param f: predicate applied to every item of ``a[0]['analysis']``.
    :param a: sequence whose first element is a mapping with an
        ``'analysis'`` entry holding an iterable of analysis mappings.
    :returns: list with the ``'lemma_tokens'`` value of every accepted analysis,
        in their original order.
    :raises IndexError: if ``a`` is empty.
    :raises KeyError: if ``'analysis'`` or ``'lemma_tokens'`` is missing.
    """
    return [x['lemma_tokens'] for x in filter(f, a[0]['analysis'])]
def read_data(fn):
    """Return the entire content of the file ``fn`` as text decoded from UTF-8.

    :param fn: path of the file to read.
    :returns: the file content as a (unicode) text string.
    :raises IOError: (``OSError`` on Python 3) if the file cannot be opened.
    :raises UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # io.open accepts ``encoding`` on both Python 2 and Python 3; the builtin
    # open() would decode with the platform default encoding instead.
    import io

    with io.open(fn, 'r', encoding='utf-8') as f:
        return f.read()
def remove_symbols(a):
    """Delete every ``]``, ``?``, ``=`` and ``+`` from ``a`` and UTF-8 encode the rest.

    :param a: text string to clean.
    :returns: the cleaned text encoded as UTF-8 bytes.
    """
    # In the character class a leading ']' is taken literally, so the set
    # matches exactly the four characters ']', '?', '=' and '+'.
    return re.sub(r'[]?=+]', '', a).encode('utf8')
def get_args():
    """Parse the command-line arguments of the script.

    Options: ``--data`` (required string), ``--order`` (int, default 4),
    ``--tokenize`` (flag), ``--pos`` (string, default ``'S'``) and
    ``-v/--version``.

    :returns: the ``argparse.Namespace`` produced by ``parse_args()``.
    """
    p = argparse.ArgumentParser(description='')
    p.add_argument('--data', type=str, action='store', dest='data', required=True)
    p.add_argument('--order', type=int, action='store', default=4)
    p.add_argument('--tokenize', action='store_true')
    # Join the sorted keys explicitly: str(dict.keys()) would show up in the
    # help as "dict_keys([...])" (Python 3) or "[u'S', u'A']" (Python 2).
    p.add_argument('--pos', type=str, action='store', dest='pos', default='S',
                   help='part of speech, one of ' + ', '.join(sorted(funs_filter)))
    p.add_argument('-v', '--version', action='version', version='%(prog)s v0.0.1')
    return p.parse_args()
def main():
    """Echo every stripped line of standard input to standard output.

    The command-line arguments are parsed first. For each input line the
    ``try`` body runs, failures are reported on standard error, and the
    stripped line is printed to standard output in every case.
    """
    # Parsed up front so that invalid options abort before any input is read;
    # the namespace itself is not used by the code visible here.
    args = get_args()
    for line in sys.stdin:
        line = line.strip()
        try:
            # NOTE(review): ``parts`` is not defined anywhere in the visible
            # code, so this statement raises NameError, which the generic
            # handler below reports. The code that computes ``parts`` appears
            # to be missing -- restore it before relying on this message.
            print("Error: parts do not match the input: {0} {1}".format(line, parts), file=sys.stderr)
        except IndexError:
            # Deliberately ignored; the line is still echoed by ``finally``.
            pass
        except Exception:
            # Catch Exception, not everything, so that KeyboardInterrupt and
            # SystemExit still stop the script.
            print("Error:" + str(sys.exc_info()[0]), file=sys.stderr)
        finally:
            print(line)
# Run main() only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment