Skip to content

Instantly share code, notes, and snippets.

@EmilStenstrom
Created August 31, 2018 08:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save EmilStenstrom/3ea0c1ed99c8d653e0a3a4720dfce2cd to your computer and use it in GitHub Desktop.
Save EmilStenstrom/3ea0c1ed99c8d653e0a3a4720dfce2cd to your computer and use it in GitHub Desktop.
Python wrapper for the (very raw) Python interface of ufal.udpipe. Saving it here for posterity.
import re
from ufal.udpipe import Model, Pipeline, ProcessingError
class Parser:
    """Thin wrapper around a ufal.udpipe model that returns CoNLL-U text.

    The model file is looked up by language code in ``MODELS`` and loaded
    once in the constructor; ``parse`` then runs text through a pipeline
    built on that model.
    """

    # Maps a language code to the path of its UDPipe model file.
    MODELS = {
        "swe": "data/swedish-ud-2.0-170801.udpipe",
    }

    def __init__(self, language):
        """Load the model registered for *language*.

        Args:
            language: Key into ``MODELS`` (e.g. ``"swe"``).

        Raises:
            ParserException: If *language* has no entry in ``MODELS``, or
                if ``Model.load`` returns a falsy value for the model path.
        """
        model_path = self.MODELS.get(language)
        if not model_path:
            raise ParserException("Cannot find model for language '%s'" % language)

        model = Model.load(model_path)
        if not model:
            raise ParserException("Cannot load model from file '%s'\n" % model_path)

        self.model = model

    def parse(self, text):
        """Run *text* through the UDPipe pipeline and return its output.

        The pipeline is created with the "tokenize" input setting, the
        default tagger and parser settings, and "conllu" as output format.

        Args:
            text: The text to analyse. Surrounding whitespace is ignored.

        Returns:
            The pipeline output as a string. For empty or whitespace-only
            input an empty string is returned without invoking the pipeline.

        Raises:
            ParserException: If the pipeline reports a processing error.
        """
        text = text.strip()
        if not text:
            # Nothing to analyse. Without this guard the last-character
            # lookup below raises IndexError on an empty string.
            return ""

        # Adding a period improves detection on especially short sentences
        period_added = False
        if re.match(r"\w", text[-1], flags=re.UNICODE):
            text += "."
            period_added = True

        pipeline = Pipeline(
            self.model,
            "tokenize",
            Pipeline.DEFAULT,
            Pipeline.DEFAULT,
            "conllu",
        )
        error = ProcessingError()
        processed = pipeline.process(text, error)
        if error.occurred():
            raise ParserException(error.message)

        # Remove the period to make sure input corresponds to output.
        # This drops the final output line, assuming the tokenizer emitted
        # the appended period as its own last token line.
        if period_added:
            output_lines = processed.rstrip().split("\n")
            processed = "\n".join(output_lines[:-1]) + "\n\n"

        return processed
class ParserException(Exception):
    """Raised when a model cannot be found or loaded, or when parsing fails."""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment