-
-
Save frreiss/bdb1defa5d7827951ae1d948346362b6 to your computer and use it in GitHub Desktop.
https://github.com/propbank/propbank-release/blob/master/docs/scripts/skeleton2conll.py patched for Python 3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Get most current usage with: | |
python skeleton2conll.py --help | |
""" | |
from __future__ import with_statement | |
import codecs | |
import sys | |
import os | |
import re | |
import string | |
from collections import defaultdict | |
try: | |
from cStringIO import StringIO ## for Python 2 | |
except ImportError: | |
from io import StringIO ## for Python 3 | |
# Column indices into a whitespace-split *_conll row.
WORD_COLUMN=3
LEMMA_COLUMN=6
# Verbosity thresholds used by debug()/warning(): a message prints only
# when its verbosity is <= the global VERBOSITY level.
MIN_VERBOSITY = 0
MED_VERBOSITY = 5
MAX_VERBOSITY = 10
SUPER_VERBOSITY = 15
# Global switches: DEBUG gates debug() output entirely; VERBOSITY is the level.
DEBUG = False
VERBOSITY = MAX_VERBOSITY
def debug(debug_object, debug_flag=DEBUG, verbosity=MAX_VERBOSITY, nl=False):
    """Write str(debug_object) to stderr when debugging is enabled.

    Nothing is emitted unless debug_flag is truthy AND this message's
    verbosity is at or below the global VERBOSITY threshold.

    debug_object -- any object; converted with str() before writing
    debug_flag   -- per-call gate (defaults to the global DEBUG)
    verbosity    -- level of this message (lower == more important)
    nl           -- append a trailing newline when True
    """
    # Short-circuit on debug_flag first, matching the original evaluation
    # order.  The original compared `debug_flag == True`; a plain truth
    # test is the idiomatic equivalent for the boolean flags used here.
    if debug_flag and verbosity <= VERBOSITY:
        sys.stderr.write(str(debug_object) + ("\n" if nl else ""))
def warning(warning_string, verbosity=0):
    """ print warning string depending on the value of VERBOSITY """
    # verbosity defaults to 0 (MIN_VERBOSITY), so by default a warning is
    # emitted at any global VERBOSITY setting; pass a higher verbosity to
    # make the warning suppressible.
    if(verbosity <= VERBOSITY):
        sys.stderr.write(u"""
--------------------------------------------------------------------------------
WARNING
--------------------------------------------------------------------------------
%s
--------------------------------------------------------------------------------
""" % (warning_string))
class abstract_open_type_table:
    """Base class for open-class "type" tables.

    Subclasses supply a class-level ``type_hash`` (a ``defaultdict(int)``)
    plus SQL statement strings; instantiating a subclass records one
    occurrence of the given id in ``type_hash``.
    """
    def __init__(self, a_id, data_pointer=None):
        # data_pointer is accepted but unused here -- presumably kept for
        # caller compatibility; TODO confirm against the full project.
        self.id = a_id
        self.type_hash[self.id] += 1
    @classmethod
    def write_to_db(cls, cursor):
        # NOTE(review): insert_ignoring_dups is not defined in this file;
        # it must come from the surrounding project -- verify before use.
        for a_type in cls.type_hash.keys():
            insert_ignoring_dups(cls, cursor, a_type)
    @classmethod
    def __repr__(cls):
        # Unusual: __repr__ defined as a classmethod; renders every
        # recorded id, space separated.
        return " ".join(cls.type_hash.keys())
    @classmethod
    def get_table(cls):
        # Extract the table name (third token of the first line) from the
        # subclass's sql_insert_statement, e.g. "insert into lemma_type".
        try:
            return cls.sql_insert_statement.strip().split("\n")[0].split()[2]
        except Exception:
            return "unknown"
class lemma_type(abstract_open_type_table):
    """Open-class table of lemma identifiers (one row per distinct id)."""
    # Shared occurrence counter: lemma id -> count.
    type_hash = defaultdict(int)
    sql_table_name = "lemma_type"
    # DDL for the backing MySQL table.
    sql_create_statement = \
"""
create table lemma_type
(
id varchar(255) not null collate utf8_bin primary key
)
default character set utf8;
"""
    # Parameterized insert; get_table() parses the table name from line 1.
    sql_insert_statement = \
"""insert into lemma_type
(
id
)
values (%s)
"""
class lemma:
    """ arabic trees have extra lemma information

    One instance describes a single leaf of an Arabic parse tree, as read
    from a .lemma file: the raw input string, its Buckwalter
    transliteration, (un)vocalized forms, part of speech, English gloss,
    and the lemma itself with an optional coarse sense.
    """
    def __init__(self, input_string, b_transliteration, comment, index, offset, unvocalized_string,
                 vocalized_string, vocalized_input, pos, gloss, lemma, coarse_sense, leaf_id):
        self.input_string = input_string
        self.b_transliteration = b_transliteration
        self.comment = comment
        self.index = index
        self.offset = offset
        self.unvocalized_string = unvocalized_string
        self.vocalized_string = vocalized_string
        self.vocalized_input = vocalized_input
        self.pos = pos
        self.gloss = gloss
        self.lemma = lemma
        self.coarse_sense = coarse_sense
        self.leaf_id = leaf_id
        # Composite identifier: "<lemma>@<leaf_id>".
        self.id = "%s@%s" % (self.lemma, self.leaf_id)
    sql_table_name = "lemma"
    def __repr__(self):
        return "\n".join(["lemma instance:",
                          " input_string: " + self.input_string,
                          " vocalized_input: " + self.vocalized_input,
                          " unvocalized_string: " + self.unvocalized_string,
                          " vocalized_string: " + self.vocalized_string,
                          " gloss: " + self.gloss,
                          " index: %s" % self.index,
                          " offset: %s" % self.offset])
    def __str__(self):
        # Renders the same block format the .lemma-file parser in start()
        # matches with its INPUT STRING:/IS_TRANS:/... regexes.
        tr = ["INPUT STRING:%s" % self.input_string,
              " IS_TRANS:%s" % self.b_transliteration,
              " COMMENT:%s" % self.comment,
              " INDEX:%s" % self.index,
              " OFFSETS:%s" % self.offset,
              " UNVOCALIZED:%s" % self.unvocalized_string,
              " VOCALIZED:%s" % self.vocalized_string,
              " VOC_STRING:%s" % self.vocalized_input,
              " POS:%s" % self.pos,
              " GLOSS:%s" % self.gloss]
        if self.lemma != "lemma_not_set":
            if self.coarse_sense:
                lemma_str = "%s_%s" % (self.lemma, self.coarse_sense)
            else:
                lemma_str = self.lemma
            tr.append(" LEMMA: [%s]" % lemma_str)
        return "\n".join(tr)
    @staticmethod
    def from_db(a_leaf_id, a_cursor):
        """Load the lemma row for a_leaf_id; returns None when absent.

        Expects a dict-style cursor (rows indexable by column name).
        Duplicate rows are tolerated only when they agree on the lemma.
        """
        # BUG FIX: parameterized query -- the original interpolated
        # a_leaf_id directly into the SQL string (injection-prone and
        # broken for ids containing quotes).
        a_cursor.execute("SELECT * FROM lemma WHERE leaf_id = %s", (a_leaf_id,))
        rows = a_cursor.fetchall()
        if not rows:
            return None
        if len(rows) != 1:
            # BUG FIX: row.iteritems() is Python 2 only; use items().
            assert all(row["lemma"] == rows[0]["lemma"] for row in rows), \
                "\n".join(", ".join(": ".join(a) for a in row.items()) for row in rows)
        r = rows[0]
        return lemma(r["input_string"],
                     r["b_transliteration"],
                     r["comment"],
                     r["lemma_index"],
                     r["lemma_offset"],
                     r["unvocalized_string"],
                     r["vocalized_string"],
                     r["vocalized_input"],
                     r["pos"],
                     r["gloss"],
                     r["lemma"],
                     r["coarse_sense"],
                     r["leaf_id"])
    # sql create statement for the lemma table
    sql_create_statement = \
"""
create table lemma
(
id varchar(255) not null,
input_string varchar(255),
b_transliteration varchar(255),
comment varchar(255),
lemma_index varchar(255),
lemma_offset varchar(255),
unvocalized_string varchar(255),
vocalized_string varchar(255),
vocalized_input varchar(255),
pos varchar(255),
gloss varchar(255),
lemma varchar(255),
coarse_sense varchar(16),
leaf_id varchar(255),
foreign key (leaf_id) references tree.id
)
default character set utf8;
"""
    # sql insert statement for the lemma table
    sql_insert_statement = \
"""
insert into lemma
(
id,
input_string,
b_transliteration,
comment,
lemma_index,
lemma_offset,
unvocalized_string,
vocalized_string,
vocalized_input,
pos,
gloss,
lemma,
coarse_sense,
leaf_id
) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""
    def write_to_db(self, cursor):
        """Insert this instance as one row via the DB-API cursor."""
        data = [(self.id, self.input_string, self.b_transliteration, self.comment, self.index,
                 self.offset, self.unvocalized_string, self.vocalized_string, self.vocalized_input,
                 self.pos, self.gloss, self.lemma, self.coarse_sense, self.leaf_id)]
        cursor.executemany("%s" % (self.__class__.sql_insert_statement), data)
def iterate_trees(string_seq):
    """
    given string_seq which is a sequence of strings, read from
    string_seq and produce strings one at a time that represent trees.
    """
    # Drop whitespace-only fragments the helper may emit between trees.
    return [tree for tree in _iterate_trees_helper(string_seq) if tree.strip()]
def _iterate_trees_helper(string_seq):
    """Yield one complete parenthesized tree at a time from string_seq."""
    depth = 0
    buf = []
    for chunk in string_seq:
        skippable = chunk.startswith(";") or chunk.startswith("<") or chunk.startswith("*")
        if skippable and chunk.endswith("\n"):
            continue  # ignore comments and sgml
        for ch in chunk:
            if ch == "(" and depth == 0 and buf:
                # A new tree opens while text is still buffered: flush it.
                yield "".join(buf)
                buf = []
            buf.append(ch)
            if ch == "(":
                depth += 1
            elif ch == ")":
                depth -= 1
                if depth == 0:
                    # Balanced again: one full tree is complete.
                    yield "".join(buf).strip()
                    buf = []
    if depth != 0:
        raise Exception("Parens should have been zero at end, were %s" % depth)
    if "".join(buf).strip():
        raise Exception("curparse should have been empty at end, was %s" % buf)
class InvalidSexprException(Exception):
    """Raised by parse_sexpr on malformed input; optionally chains the
    enclosing (parent) expression for context."""
    def __init__(self, sexpr, parent=None):
        self.sexpr = sexpr
        self.parent = parent
    def __str__(self):
        rendered = self.sexpr
        if self.parent:
            # Append the parent's rendering, separated by a blank line.
            rendered = "%s\n\n%s" % (rendered, str(self.parent))
        return rendered
def parse_sexpr(s):
    """ turn an s-expression into a tree of lists:

    (a (b c) d) -> [a, [b, c], d]

    uses spaces and parens only -- no way to have a token with a space in it
    """
    s = s.replace("\n", " ").replace("\t", " ").strip()
    opens, closes = s.startswith("("), s.endswith(")")
    if not opens and not closes:
        # A bare token parses to itself.
        return s
    if not (opens and closes):
        raise InvalidSexprException("Invalid s-expression: \n" + s)
    result = []
    token = []
    depth = 0
    # The trailing sentinel space guarantees the final token is flushed.
    for ch in s[1:-1].strip() + " ":
        if ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
        elif ch == " " and token and depth == 0:
            # Top-level boundary: recursively parse the buffered token.
            try:
                parsed = parse_sexpr("".join(token))
            except InvalidSexprException as err:
                raise InvalidSexprException("Parent: %s" % s, err)
            if parsed:
                result.append(parsed)
            token = []
        token.append(ch)
    if (token and token != [" "]) or depth != 0:
        raise InvalidSexprException("Invalid s-expression: " + s + " note: %s" % "".join(token) + " parens: %s" % depth)
    return result
def unparse_sexpr(l):
    """Inverse of parse_sexpr: render a parse-list back to '(a (b c) d)' text."""
    if type(l) is list:
        rendered = (unparse_sexpr(item) for item in l)
        return "(" + " ".join(rendered) + ")"
    return str(l)
def pretty_print_tree_string(a_tree_string, offset=''):
    """Indent a one-line parse tree so sibling constituents line up.

    The tree is split after each lexical leaf, and each following
    fragment is indented to the column where its parent constituent
    opened on the previous line.

    a_tree_string -- parse-tree text, e.g. "(TOP (S ...))"
    offset        -- string prefixed to every emitted line
    Returns the pretty-printed tree, or "" for blank input.
    """
    if not a_tree_string.strip():
        return ""
    # Maximum tree depth / indentation column we're prepared for.
    maxdepth = 100
    maxindent = 300
    # depth_to_indent[d]: column where the next sibling at depth d starts.
    depth_to_indent = [0 for i in range(maxdepth)]
    # indent_string[i] is a string of i spaces.
    # BUG FIX: the original used xrange() here, which does not exist in
    # Python 3 and made this function raise NameError on every call.
    indent_string = [' ' * i for i in range(maxindent)]
    # Matches a ')' followed by a non-')' token, consuming only the ')'.
    close_paren = re.compile(r'\)(?=\s*[^\s\)])')
    a_tree = a_tree_string
    pp_tree = ""
    def treeindent(depth):
        # Indent to the remembered column for this depth, plus caller offset.
        return indent_string[depth_to_indent[depth]] + offset
    current_depth = 0
    for frag in close_paren.split(a_tree):  # one fragment per lexical item
        if frag[-1] != '\n':
            frag = frag + ')'  # restore the ')' consumed by the split
        else:
            frag = frag[0:-1]
        pp_tree += treeindent(current_depth)
        pfrag = ""
        for pfrag in (frag).split('(')[1:]:  # segments each beginning with '('
            pfrag = '(' + pfrag  # restore the '(' deleted by split
            pp_tree += pfrag
            current_depth = current_depth + 1
            # Remember how far to indent following items at this depth.
            depth_to_indent[current_depth] = depth_to_indent[current_depth - 1] + len(pfrag)
            current_depth = current_depth - pfrag.count(')')  # account for closers
        if current_depth <= 0:
            pp_tree += ''  # separate toplevel trees
        pp_tree += '\n'
    # Drop the compensating ')' appended after the final fragment.
    return re.sub(r"\)$", "", pp_tree)
# When a transformation empties a tree entirely, keep a placeholder node
# instead of dropping the tree (so tree counts stay aligned downstream).
DONT_DELETE_TREES = True
def car(sp):
    """First element of a parsed node (its tag)."""
    return sp[0]
def cdr(sp):
    """Everything after the tag (the children)."""
    return sp[1:]
def split_node(sp):
    """Split a parsed node into (tag, children)."""
    return car(sp), cdr(sp)
def is_leaf(sp):
    """A leaf is a two-element [tag, word] pair whose second item is not a list."""
    return len(sp) == 2 and type(sp[1]) is not list
# Registry mapping command-line flags (e.g. "-trace") to transform functions.
transformations = {}
def pp(sexpr, out_text=False):
    """ pretty print the S-expr, or just spit text out if out_text is true

    out_text also skips traces
    """
    if out_text:
        # One word per line, dropping trace leaves (tagged -NONE-).
        kept = [word for tag, word in all_leaves(sexpr) if tag != "-NONE-"]
        return "\n".join(kept)
    return pretty_print_tree_string(unparse_sexpr(sexpr))
def transforms(transformation):
    """Decorator factory: register the decorated function in the
    transformations registry under a "+flag"/"-flag" name."""
    assert transformation.startswith(("+", "-"))
    def regfunc(func):
        transformations[transformation] = func
        return func
    return regfunc
def require(b):
    """Raise a generic Exception unless b is truthy (cheap runtime assertion)."""
    if b:
        return
    raise Exception("Failed Requirement")
@transforms("-edited")
def remove_edits(sp):
    """Remove subtrees tagged 'EDITED' (disfluencies) """
    # Registered as the "-edited" transformation; delegates to remove_tagger.
    return remove_tagger(sp, "EDITED")
@transforms("-trace")
def remove_traces(sp):
    """Remove traces part of speech tagged '-NONE-' """
    # BUG FIX: this function was also named remove_edits, silently
    # shadowing the "-edited" transform's module-level name.  Callers go
    # through the transformations registry (key "-trace" is unchanged),
    # so renaming it is safe.
    return remove_tagger(sp, "-NONE-")
@transforms("-phrase-tags")
def all_leaves(sp):
    """Make a tree of just the leaves

    Flattens a parse such as

    .. code-block: scheme

      (TOP (S (NP-SBJ (NNP Zambia))
              (VP (VBD had)
                  (ADVP-TMP (RB previously))
                  (VP (VBD lost)
                      (NP (PRP$ its) (RB away) (VBD game))))
              (. .)))

    into a flat list of [tag, word] pairs:

    .. code-block: scheme

      ( (NNP Zambia) (VBD had) (RB previously) (VBD lost)
        (PRP$ its) (RB away) (VBD game) (. .) )
    """
    tag, children = split_node(sp)
    if is_leaf(sp):
        # A leaf contributes exactly one [tag, word] pair.
        return [[tag, children[0]]]
    collected = []
    for child in children:
        collected.extend(all_leaves(child))
    return collected
def remove_tagger(sp, tag_to_remove):
    """ remove tag_to_remove from sp, culling empty branches """
    return remover(sp, lambda tag, rest: tag == tag_to_remove)
def remover(sp, callback):
    """Return a copy of sp with every node for which callback(tag, children)
    is true removed; branches emptied by the removal are culled too."""
    tag, children = split_node(sp)
    if callback(tag, children):
        return []
    if is_leaf(sp):
        return sp
    kept = []
    for child in children:
        pruned = remover(child, callback)
        if pruned:
            kept.append(pruned)
    if not kept:
        # Every child was removed: cull this branch as well.
        return []
    return [tag] + kept
def pad_items_in_list(a_list, a_character=None):
    """
    this function will return the same list with the right amount of
    padding equal to two spaces on each side of the widest string. it
    will perform right justification.

    if the optional character is specified, then it will do a
    centering around the character in the process of padding.
    left/right justification does not work with this option.

    Note: a_list is modified in place and also returned.
    """
    # Centering only applies when every item contains the character;
    # otherwise fall back to plain right justification.
    if(a_character != None):
        for an_item in a_list:
            if(an_item.find(a_character) == -1):
                a_character = None
                break
    if(a_character != None):
        # Widest prefix before / suffix after the alignment character.
        # NOTE(review): this width scan hard-codes "*" while the per-item
        # alignment below uses a_character; in practice callers pass "*",
        # but confirm before using a different character.
        lmax = 0
        rmax = 0
        for an_item in a_list:
            an_item = an_item.strip()
            lf = an_item.find("*")
            if(lmax < lf):
                lmax = lf
            rf = len(an_item) - an_item.find("*")
            if(rmax < rf):
                rmax = rf
        for i in range(0, len(a_list)):
            a_list[i] = a_list[i].strip()
            x = a_list[i].find(a_character)
            len_i = len(a_list[i])
            # Two extra spaces of padding on each side of the alignment point.
            a_list[i] = " " * (lmax - x + 2) + a_list[i]
            a_list[i] = a_list[i] + " " * (rmax - len_i + x + 2)
    else:
        # Plain right justification against the widest stripped item.
        # (Renamed from 'max', which shadowed the builtin.)
        width = 0
        for an_item in a_list:
            an_item = an_item.strip()
            if(width < len(an_item)):
                width = len(an_item)
        for i in range(0, len(a_list)):
            a_list[i] = a_list[i].strip()
            # Items ending in *, - or a digit get a trailing space so
            # adjacent columns stay visually separated.  The emptiness
            # guard fixes an IndexError on empty cells.
            if(a_list[i] and (a_list[i].endswith("*") or
                              a_list[i].endswith("-") or
                              a_list[i][-1] in string.digits)):
                a_list[i] = "%s " % (a_list[i])
            a_list[i] = a_list[i].rjust(width + 2)
    return a_list
def rows2columns(matrix):
    """Transpose a ragged row-major matrix into a list of columns.

    Column c collects the c-th cell of every row that has one, so
    shorter rows simply contribute to fewer columns.
    """
    columns = []
    for row in matrix:
        for c, cell in enumerate(row):
            if c == len(columns):
                # First time we've seen this column index.
                columns.append([])
            columns[c].append(cell)
    return columns
def pretty_print_table(rows, separator=None, out_file=None):
    """Align whitespace-separated rows into padded columns.

    rows      -- sequence of strings, each a whitespace-delimited row
    separator -- alignment character applied to column 5 and columns > 10
    out_file  -- None: print to stdout; "-": return the table as a string;
                 anything else is not implemented
    """
    # Build the row-major cell matrix, then pad column by column.
    r_c_matrix = [row.split() for row in rows]
    c_r_matrix = rows2columns(r_c_matrix)
    for i in range(0, len(c_r_matrix)):
        # Only column 5 and columns beyond 10 are centered on the separator.
        if(i == 5 or i > 10):
            padding_character = separator
        else:
            padding_character = None
        c_r_matrix[i] = pad_items_in_list(c_r_matrix[i], padding_character)
    r_c_matrix = rows2columns(c_r_matrix)
    if(out_file == None):
        for row in r_c_matrix:
            print(" ".join(row).strip())
    elif(out_file == "-"):
        joined = [" ".join(row).strip() for row in r_c_matrix]
        return "%s\n" % ("\n".join(joined))
    else:
        raise NotImplementedError("this functionality has not yet been implemented")
def start(input_fname, conll_fname, output_fname, encoding, changes):
    """ apply changes in order to the trees in input_fname, write to output_fname

    input_fname  -- OntoNotes .parse file of one-line trees
    conll_fname  -- *_skel file whose WORD_COLUMN placeholders get filled in
    output_fname -- destination *_conll file, or "-" for stdout
    encoding     -- text encoding for all file I/O
    changes      -- list of transformation flags; "--text" and "--topless"
                    are consumed here, the rest are looked up in the
                    transformations registry and applied in order
    """
    out_text = False
    topless = False
    if "--text" in changes:
        out_text = True
        changes.remove("--text")
    if "--topless" in changes:
        topless = True
        changes.remove("--topless")
    out = []
    with codecs.open(input_fname, "r", encoding) as inf:
        if topless:
            # Re-add a TOP label to topless trees before parsing.
            e = inf.read().replace("( (","(TOP (")
            inf = StringIO(e)
        for a_tree in iterate_trees(inf):
            sexpr = parse_sexpr(a_tree)
            for change in changes:
                if not sexpr:
                    continue
                try:
                    change_func = transformations[change]
                except KeyError:
                    raise Exception("Invalid argument '%s' for change. Allowed changes are: %s" % (change, transformations.keys()))
                try:
                    old_sexpr = sexpr[:]
                    sexpr = change_func(sexpr)
                except Exception:
                    sys.stderr.write("ERR in %s\n\nTree:\n%s\n\nInput sexpr:\n%s\n" % (change, a_tree, pp(sexpr)))
                    raise
                # A transform emptied the whole tree: substitute a dummy
                # (XX nullp) node so tree counts stay aligned with the skel.
                if not sexpr and DONT_DELETE_TREES:
                    nullp = ["XX", "nullp"]
                    if old_sexpr and old_sexpr[0] == "TOP":
                        sexpr = ["TOP", nullp]
                    else:
                        sexpr = nullp
            if sexpr:
                out.append(pp(sexpr, out_text))
    # w_list[t] is the list of output lines (one word per line when
    # out_text is set) for tree t.
    w_list = []
    for o in out:
        w_list.append(o.split("\n"))
    num_words = 0
    for a_word_list in w_list:
        for a_word in a_word_list:
            num_words = num_words + 1
    debug("number of words: %d\n" % (num_words), DEBUG, MAX_VERBOSITY)
    debug("input_fname: %s" % (input_fname), DEBUG, MAX_VERBOSITY)
    # Arabic parses carry companion .lemma files with per-leaf morphology;
    # detect them from the path.  NOTE(review): interpolating os.sep into
    # the regex only works where os.sep is "/" -- confirm on Windows.
    is_arabic = False
    a_list_of_lemmas = []
    if re.search('data%s+arabic%s+annotations' % (os.sep, os.sep), input_fname):
        is_arabic = True
    if is_arabic is True:
        lemma_fname = re.sub("\.parse$", ".lemma", input_fname)
        debug("lemma_fname: %s" % (lemma_fname), DEBUG, MAX_VERBOSITY)
        if os.path.exists(lemma_fname):
            lemma_file = codecs.open(lemma_fname, "r", "utf-8")
            actual_word_list = []
            buckwalter_word_list = []
            lemma_list = []
            # One regex per field of a .lemma block (see lemma.__str__ for
            # the block format).
            input_string_regex = re.compile(r"^\s*INPUT STRING:(.*)", re.U|re.MULTILINE)
            buckwalter_regex = re.compile(r"^\s*IS_TRANS:(.*)", re.U|re.MULTILINE)
            comment_regex = re.compile(r"^\s*COMMENT:(.*)", re.U|re.MULTILINE)
            index_regex = re.compile(r"^\s*INDEX:(.*)", re.U|re.MULTILINE)
            offsets_regex = re.compile(r"^\s*OFFSETS:(.*)", re.U|re.MULTILINE)
            unvocalized_string_regex = re.compile(r"^\s*UNVOCALIZED:(.*)", re.U|re.MULTILINE)
            vocalized_string_regex = re.compile(r"^\s*VOCALIZED:(.*)", re.U|re.MULTILINE)
            vocalized_input_string_regex = re.compile(r"^\s*VOC_STRING:(.*)", re.U|re.MULTILINE)
            pos_string_regex = re.compile(r"^\s*POS:(.*)", re.U|re.MULTILINE)
            gloss_string_regex = re.compile(r"^\s*GLOSS:(.*)", re.U|re.MULTILINE)
            lemma_regex = re.compile(r"LEMMA:\s+\[([^\]]*)\]", re.U|re.MULTILINE)
            lemma_file_lines = lemma_file.readlines()
            list_of_lemma_blocks = []
            i=0
            lemma_block = ""
            list_of_lemma_blocks = []
            # Collect blank-line-delimited blocks starting at INPUT STRING:.
            while(i<len(lemma_file_lines)):
                input_string_regex_match = input_string_regex.findall(lemma_file_lines[i])
                if(input_string_regex_match != []):
                    while(i<len(lemma_file_lines) and lemma_file_lines[i].strip() != ""):
                        lemma_block = "%s%s" % (lemma_block, lemma_file_lines[i])
                        i=i+1
                    if(lemma_block.strip() != ""):
                        list_of_lemma_blocks.append(lemma_block)
                        lemma_block = ""
                i=i+1
            list_of_input_strings = []
            list_of_b_transliterations = []
            list_of_comments = []
            list_of_indices = []
            list_of_offsets = []
            list_of_unvocalized_strings = []
            list_of_vocalized_strings = []
            list_of_vocalized_inputs = []
            list_of_pos = []
            list_of_glosses = []
            list_of_lemmas = []
            # Extract every field from every block; a missing LEMMA field
            # is tolerated (filled with "lemma_not_set"), anything else
            # missing is fatal.
            for lemma_block in list_of_lemma_blocks:
                for a_list, a_regex, a_name in [[list_of_input_strings, input_string_regex, "input"],
                                                [list_of_b_transliterations, buckwalter_regex, "transliterations"],
                                                [list_of_comments, comment_regex, "comment"],
                                                [list_of_indices, index_regex, "indecies"],
                                                [list_of_offsets, offsets_regex, "offsets"],
                                                [list_of_unvocalized_strings, unvocalized_string_regex, "unvocalized_strings"],
                                                [list_of_vocalized_strings, vocalized_string_regex, "vocalized_strings"],
                                                [list_of_vocalized_inputs, vocalized_input_string_regex, "vocalized_inputs"],
                                                [list_of_pos, pos_string_regex, "pos_strings"],
                                                [list_of_glosses, gloss_string_regex, "gloss_strings"],
                                                [list_of_lemmas, lemma_regex, "lemmas"]]:
                    try:
                        a_list.append(a_regex.findall(lemma_block)[0])
                    except IndexError:
                        if a_name == "lemmas":
                            list_of_lemmas.append("lemma_not_set")
                        else:
                            raise Exception("Didn't find any %s in %s (%s)" % (a_name, ("\n" + lemma_block).replace("\n", "\n "), lemma_fname))
            # temporarily copying the lists to another list used earlier
            actual_word_list = [] + list_of_input_strings
            buckwalter_word_list = [] + list_of_b_transliterations
            lemma_list = [] + list_of_lemmas
            debug("len(actual_word_list): %s\n" % (len(actual_word_list)), DEBUG, MAX_VERBOSITY)
            debug("actual_word_list: %s\n" % (actual_word_list), DEBUG, MAX_VERBOSITY)
            debug("len(buckwalter_word_list): %s\n" % (len(buckwalter_word_list)), DEBUG, MAX_VERBOSITY)
            debug("buckwalter_word_list: %s\n" % (buckwalter_word_list), DEBUG, MAX_VERBOSITY)
            debug("len(lemma_list): %s\n" % (len(lemma_list)), DEBUG, MAX_VERBOSITY)
            debug("lemma_list: %s\n" % (lemma_list), DEBUG, MAX_VERBOSITY)
            if(len(actual_word_list) != len(buckwalter_word_list)
               or
               len(actual_word_list) != len(lemma_list)):
                debug("len(actual_word_list): %s\n" % (len(actual_word_list)), DEBUG, MAX_VERBOSITY)
                debug("len(buckwalter_word_list): %s\n" % (len(buckwalter_word_list)), DEBUG, MAX_VERBOSITY)
                debug("len(lemma_list): %s\n" % (len(lemma_list)), DEBUG, MAX_VERBOSITY)
                raise Exception("the three lists -- actual word, buckwalter word, and lemma should be the same length, or else some information might be missing from the .lemma file")
            for i in range(0, len(actual_word_list)):
                if(lemma_list[i] == "DEFAULT"
                   or
                   buckwalter_word_list[i] == ""):
                    debug("%s %s %s\n" % (actual_word_list[i].rjust(50), buckwalter_word_list[i].rjust(50), lemma_list[i].rjust(50)), DEBUG, MAX_VERBOSITY)
            # Build one lemma object per word; "lemma_sense" splits into
            # lemma + coarse sense on the underscore.
            for i in range(0, len(actual_word_list)):
                lemma_lemma = list_of_lemmas[i]
                coarse_sense = ""
                if "_" in lemma_lemma and lemma_lemma != "lemma_not_set":
                    try:
                        lemma_lemma, coarse_sense = lemma_lemma.split("_")
                    except ValueError:
                        raise
                lemma_object = lemma(list_of_input_strings[i],
                                     list_of_b_transliterations[i],
                                     list_of_comments[i],
                                     list_of_indices[i],
                                     list_of_offsets[i],
                                     list_of_unvocalized_strings[i],
                                     list_of_vocalized_strings[i],
                                     list_of_vocalized_inputs[i],
                                     list_of_pos[i],
                                     list_of_glosses[i],
                                     lemma_lemma,
                                     coarse_sense,
                                     i)
                debug("lemma_object: %s" % (lemma_object), DEBUG, MAX_VERBOSITY)
                a_list_of_lemmas.append(lemma_object)
    # Truncate the output file; sentence blocks are appended below.
    f=codecs.open(output_fname, "w", encoding)
    f.close()
    sentences = []
    # i indexes trees, w indexes words globally across all trees.
    i=0
    w=0
    conll_file = codecs.open(conll_fname, "r", encoding)
    for line in conll_file:
        if(line.strip() == ""):
            # Blank line == end of sentence: merge buffered skel rows with
            # the words (and Arabic lemmas, if any) and emit the block.
            assert len(sentences) == len(w_list[i]), "the example should contain the same number of words as the words in the parse"
            if(a_list_of_lemmas != []):
                assert len(a_list_of_lemmas) == num_words, "the list of lemmas does not match the list of words. please report this issue."
            rows=[]
            c=0
            for columns in sentences:
                if a_list_of_lemmas != []:
                    # Word column becomes word#lemma#unvocalized#vocalized.
                    columns[WORD_COLUMN] = "%s#%s#%s#%s" % (w_list[i][c], a_list_of_lemmas[w].lemma.strip(), a_list_of_lemmas[w].unvocalized_string.strip(), a_list_of_lemmas[w].vocalized_string.strip())
                    if DEBUG:
                        if columns[LEMMA_COLUMN] == a_list_of_lemmas[w].lemma.strip():
                            print("found the same lemma")
                        else:
                            raise Exception("Something is wrong: %s %s %s" % (columns[LEMMA_COLUMN], a_list_of_lemmas[w].lemma.strip(), " ".join(columns)))
                    columns[LEMMA_COLUMN] = a_list_of_lemmas[w].lemma.strip()
                else:
                    columns[WORD_COLUMN] = w_list[i][c]
                rows.append(" ".join(columns))
                c=c+1
                w=w+1
            pretty_print_table_string = pretty_print_table(rows, out_file="-")
            if output_fname == "-":
                print(pretty_print_table_string)
            else:
                with codecs.open(output_fname, "a", encoding) as outf:
                    outf.write("%s\n" % (pretty_print_table_string))
            sentences = []
            i=i+1
        elif(line.startswith("#")):
            # Document begin/end markers pass straight through.
            if output_fname == "-":
                print(line.strip())
            else:
                with codecs.open(output_fname, "a", encoding) as outf:
                    outf.write("%s\n" % (line.strip()))
        else:
            sentences.append(line.split())
if __name__ == "__main__":
    encoding = "utf8"
    if "--gb18030" in sys.argv:
        encoding="gb18030"
        sys.argv.remove("--gb18030")
    if len(sys.argv[1:]) == 2 and sys.argv[1] in ["--help", "-h"] and sys.argv[2] in transformations:
        # "--help <transform>" prints that transform's full docstring.
        print()
        print(" ", transformations[sys.argv[2]].__doc__)
    elif not sys.argv[1:] or "--help" in sys.argv[1:] or "-h" in sys.argv[1:]:
        print()
        print("-"*120)
        print("Usage: python skeleton2conll.py <ontonotes-parse-file> <input-skel-file> <conll-output-file> [transformations] ...")
        print("\nAllowed transforms:")
        max_key_len = max(len(t) for t in transformations) + 1 # +1 for colon
        for key in transformations:
            print(" %s %s" %(("%s:"%key).rjust(max_key_len),
                             transformations[key].__doc__.split("\n")[0]))
        print(" %s %s" % ("--text:".rjust(max_key_len),
                          "Produce text output instead of parse trees"))
        print()
        print()
        print("Example:")
        print("python skeleton2conll.py <ontonotes-release-directory>/data/.../bc/cnn/00/cnn_0000.parse conll-2011/dev/data/english/annotations/bc/cnn/00/cnn_0000.v0_gold_skel conll-2011/dev/data/english/annotations/bc/cnn/00/cnn_0000.v0_gold_conll -edited --text")
        print("-"*120)
    else:
        input_fname, conll_fname, output_fname, changes = sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4:]
        # BUG FIX: the previous version passed the parse file's *contents*
        # (with "( (" rewritten to "(TOP (") as start()'s input_fname, but
        # start() opens that argument as a path, so every real invocation
        # crashed.  Pass the filename instead, and request the identical
        # "(TOP (" rewrite through start()'s existing --topless handling.
        if "--topless" not in changes:
            changes.append("--topless")
        start(input_fname, conll_fname, output_fname, encoding, changes)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment