Skip to content

Instantly share code, notes, and snippets.

@Kaljurand
Last active August 29, 2015 14:23
Show Gist options
  • Save Kaljurand/5fc2adfe03735d6219f0 to your computer and use it in GitHub Desktop.
Save Kaljurand/5fc2adfe03735d6219f0 to your computer and use it in GitHub Desktop.
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division, unicode_literals, print_function
import sys
import re
import argparse
from collections import *
def is_lemma_form(pos, x):
    """Tell whether analysis *x* has part of speech *pos* in the 'sg n' form.

    Args:
        pos: Part-of-speech tag compared against ``x['partofspeech']``.
        x: Mapping with at least a ``'partofspeech'`` key; ``'form'`` is
            only looked up when the part of speech matches.

    Returns:
        True when both the part of speech and the form match, else False.

    Raises:
        KeyError: If a key that gets looked up is missing from *x*.
    """
    # NOTE(review): 'sg n' is presumably the analyzer's tag for the
    # singular nominative (dictionary) form -- confirm against its tag set.
    return x['partofspeech'] == pos and x['form'] == 'sg n'
def _lemma_filter(pos):
    """Build a one-argument predicate that checks an analysis against *pos*."""
    return lambda x: is_lemma_form(pos, x)


# Maps a part-of-speech tag to the predicate that selects analyses of that
# part of speech in lemma form.
funs_filter = {pos: _lemma_filter(pos) for pos in ('S', 'A')}
def select_splits(f, a):
    """Collect the ``'lemma_tokens'`` of the accepted analyses of the first item.

    Only ``a[0]`` is inspected; any further elements of *a* are ignored.

    Args:
        f: Predicate called with each analysis; truthy results are kept.
        a: Indexable whose first element is a mapping with an
            ``'analysis'`` key holding an iterable of analysis mappings.

    Returns:
        A list with the ``'lemma_tokens'`` value of every analysis that
        *f* accepted, in their original order.

    Raises:
        IndexError: If *a* is empty.
        KeyError: If ``'analysis'`` or ``'lemma_tokens'`` is missing.
    """
    return [x['lemma_tokens'] for x in a[0]['analysis'] if f(x)]
def read_data(fn):
    """Return the entire content of the file at path *fn*.

    The file is opened in text read mode and closed again before the
    function returns, even when reading fails.

    Args:
        fn: Path of the file to read.

    Returns:
        Everything ``read()`` yields for the file, as a single string.
    """
    with open(fn, 'r') as f:
        return f.read()
def remove_symbols(a):
    """Strip the characters ``]``, ``?``, ``=`` and ``+`` from *a*.

    Args:
        a: Text to clean.

    Returns:
        The cleaned text encoded as UTF-8 (a byte string).
    """
    # A ']' placed first inside a character class is a literal ']', so the
    # class below matches exactly one of: ] ? = +
    return re.sub(r'[]?=+]', '', a).encode('utf8')
def get_args():
    """Parse the command-line options of this script.

    Options:
        --data      required string
        --order     integer, defaults to 4
        --tokenize  boolean flag, off unless given
        --pos       part-of-speech tag, defaults to 'S'
        -v/--version  print the program version and exit

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    p = argparse.ArgumentParser(description='')
    p.add_argument('--data', type=str, action='store', dest='data', required=True)
    p.add_argument('--order', type=int, action='store', default=4)
    p.add_argument('--tokenize', action='store_true')
    # Join the tags explicitly: str() of the keys view would render as
    # "dict_keys([...])" on Python 3 and with u'' prefixes on Python 2.
    p.add_argument(
        '--pos', type=str, action='store', dest='pos', default='S',
        help='part of speech, one of ' + ', '.join(sorted(funs_filter)))
    p.add_argument('-v', '--version', action='version', version='%(prog)s v0.0.1')
    return p.parse_args()
def main():
    """Echo every line of standard input, reporting problems on stderr.

    Each input line is stripped of surrounding whitespace and always
    written to stdout (from the ``finally`` clause), whatever happens
    while it is being processed.
    """
    # NOTE(review): `args` is not used anywhere in the visible body; the
    # call is kept because parsing validates the command line.
    args = get_args()
    for line in sys.stdin:
        line = line.strip()
        try:
            # NOTE(review): `parts` is never assigned in the visible code, so
            # this statement always raises NameError and falls through to the
            # generic handler below. The step that computes `parts` appears
            # to be missing -- restore it rather than guessing here.
            print("Error: parts do not match the input: {0} {1}".format(line, parts), file=sys.stderr)
        except IndexError:
            # Deliberately silent: the line is still echoed by `finally`.
            pass
        except Exception:
            # Narrower than a bare `except:` so that KeyboardInterrupt and
            # SystemExit can still terminate the loop.
            print("Error:" + str(sys.exc_info()[0]), file=sys.stderr)
        finally:
            print(line)
# Run the command-line entry point only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment