Skip to content

Instantly share code, notes, and snippets.

@simonepri
Last active May 23, 2020 09:41
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save simonepri/f4ad91e935a066d39d39e9b0da472988 to your computer and use it in GitHub Desktop.
Save simonepri/f4ad91e935a066d39d39e9b0da472988 to your computer and use it in GitHub Desktop.
GED Utils
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
#!/usr/bin/env python3
from typing import * # pylint: disable=wildcard-import,unused-wildcard-import
import argparse
import os
import re
def main(args: argparse.Namespace) -> None:
    """
    Convert an M2 file into a two-column, tab-separated CoNLL-style file.

    Each non-whitespace token of every original sentence is written on its own
    line as ``<token>\\t<label>``, where the label is ``B-INC`` if the selected
    annotator edited the token and ``B-COR`` otherwise. An empty line is
    written after each sentence.

    Args:
        args: Parsed command line arguments. The attributes ``m2_file_path``,
            ``output`` and ``annotator`` are read.
    """
    # A space is a token separator when a non-space character sits directly
    # before or after it. A space that has neither is left alone and ends up
    # inside a whitespace-only token.
    separator_re = re.compile(r"(?<=[^ ])([ ])|([ ])(?=[^ ])")

    # The encoding is explicit so the result does not depend on the locale.
    with open(args.m2_file_path, "r", encoding="utf-8") as in_m2:
        # Load the M2 file and split it into blocks. Blocks are separated by
        # one or more blank (or whitespace-only) lines.
        m2_blocks = re.split(r"\n\s*\n", in_m2.read().strip())

    with open(args.output, "w", encoding="utf-8") as out_conll:
        # Loop through the M2 file blocks
        for m2_block in m2_blocks:
            # An empty input file yields a single empty block; skip it.
            if not m2_block:
                continue
            info = m2_block.split("\n")
            # In the M2 format, space edits are also space separated.
            # We insert a tab character where appropriate to simplify token
            # splitting in the next line.
            info[0] = separator_re.sub("\t", info[0])
            # Get the orig sent and edits
            orig = info[0].split("\t")[1:]  # 1: ignores "S"
            edits = info[1:]
            # Get the indexes of the edited tokens
            edit_indexes = get_edit_indexes(edits, args.annotator)
            # Loop through tokens
            for idx, tok in enumerate(orig):
                # Whitespace-only tokens are not written, but they still
                # occupy an index so the edit spans stay aligned.
                if tok.strip() == "":
                    continue
                label = "B-INC" if idx in edit_indexes else "B-COR"
                out_conll.write("\t".join([tok, label]) + "\n")
            # Newline at end of sentence
            out_conll.write("\n")
def get_edit_indexes(edits: List[str], annotator_id: int) -> Set[int]:
    """
    Get token indexes in the original sentence that are modified by the edits
    provided.

    Every edit line is split on ``|||``. Field 0 has the form
    ``A <start> <end>``, field 1 is the edit type and field 5 is the id of the
    annotator that produced the edit.

    Args:
        edits: A list of edit lines from an m2 file. Blank lines are ignored.
        annotator_id: The annotator id to select

    Returns:
        A set of edited token indexes. Missing words affect the next token.

    Raises:
        IndexError: If a non-blank edit line has fewer than six fields.
        ValueError: If the annotator id or the span of an edit is not an
            integer.
    """
    edit_indexes: Set[int] = set()
    for edit in edits:
        # Tolerate stray blank lines instead of failing on the field lookup.
        if not edit.strip():
            continue
        parts = edit.split("|||")
        # Get edit type
        edit_type = parts[1]
        # Get edit annotator id
        edit_annotator_id = int(parts[5])
        # Get the edit start and end span
        edit_start_idx, edit_end_idx = map(int, parts[0].split(" ")[1:3])
        # Ignore noop edits; i.e. no errors
        if edit_type == "noop":
            continue
        # Choose only edits by the specified annotator
        if edit_annotator_id != annotator_id:
            continue
        if edit_start_idx == edit_end_idx:
            # Missing words defined as affecting the next token
            edit_indexes.add(edit_start_idx)
        else:
            # Other edits may be more than one token
            edit_indexes.update(range(edit_start_idx, edit_end_idx))
    return edit_indexes
def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """
    Build the command line parser and parse the arguments.

    Args:
        argv: The argument strings to parse. When omitted (or ``None``) the
            arguments are taken from the command line of the running process.

    Returns:
        A namespace with the attributes ``m2_file_path``, ``output``,
        ``annotator`` (defaults to 0) and ``debug`` (defaults to False).
    """
    parser = argparse.ArgumentParser(
        description="Convert an M2 file to CoNLL format."
    )
    parser.add_argument(
        "m2_file_path", type=str, help="Path to a M2 file."
    )
    parser.add_argument(
        "--output", "-o", type=str, help="The output filepath.", required=True,
    )
    parser.add_argument(
        "--annotator",
        "-a",
        type=int,
        default=0,
        help="The annotator id to select.",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="If provided it provides additional logging in case of errors.",
    )
    return parser.parse_args(argv)
def normalize_args(args: argparse.Namespace) -> None:
    """
    Replace the input path with its canonical form, in place.

    Args:
        args: Parsed command line arguments; ``m2_file_path`` is overwritten
            with the result of ``os.path.realpath``.
    """
    # NOTE(review): realpath also rewrites a literal "-", so the "-" check in
    # validate_args can presumably never match after this step; confirm
    # whether "-" is meant to be supported.
    args.m2_file_path = os.path.realpath(args.m2_file_path)
def validate_args(args: argparse.Namespace) -> None:
    """
    Check that the input path points to an existing regular file.

    Args:
        args: Parsed command line arguments; ``m2_file_path`` is checked.

    Raises:
        ValueError: If ``m2_file_path`` is not ``"-"`` and is not an existing
            file.
    """
    # A literal "-" is exempt from the file system check.
    if args.m2_file_path == "-":
        return
    if not os.path.isfile(args.m2_file_path):
        raise ValueError("The provided M2 file path is invalid.")
def run() -> None:
    """
    Command line entry point: parse, normalize and validate the arguments,
    then run the conversion.

    A keyboard interrupt prints ``Aborted!``. Any other exception is printed
    as ``Error: <message>``, unless ``--debug`` was given, in which case it is
    re-raised so that the traceback is shown.
    """
    # Bound before the try block so the error handler can tell whether
    # argument parsing got far enough to produce a namespace.
    args = None
    try:
        args = parse_args()
        normalize_args(args)
        validate_args(args)
        main(args)
    except KeyboardInterrupt:
        print("\nAborted!")
    except Exception as err:  # pylint: disable=broad-except
        # Without the None check, a failure inside parse_args() would be
        # masked by an UnboundLocalError raised from this handler.
        if args is not None and args.debug:
            raise
        print("Error: %s" % err)
# Run the converter only when the file is executed as a script.
if __name__ == "__main__":
    run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment