"""Utility that matches text patterns in spaCy/Prodigy training data.

Originally published as a GitHub gist by @wpm
(wpm/2a40784364a7398b556fb63124acf32c), created December 22, 2017.
"""
import json
from json import JSONDecodeError
from typing import Sequence, Iterable, List
import click
import spacy
from spacy.matcher import Matcher
def match_patterns(nlp, patterns: Sequence[dict], corpus: Iterable[str]) -> Iterable[str]:
    """
    Lazily yield every stretch of the corpus that matches one of the patterns.

    :param nlp: language pipeline; its ``vocab`` backs the matcher and its
        ``pipe`` method turns the corpus texts into documents
    :param patterns: match patterns, all registered under one matcher key
    :param corpus: texts to search
    :return: for each match, the slice ``document[start:end]`` of the processed
        document, documents in corpus order and matches in the order the
        matcher reports them
    """
    pattern_matcher = Matcher(nlp.vocab)
    # The second argument is the on-match callback slot of this (spaCy v2-style)
    # ``add`` signature; no callback is wanted here.
    pattern_matcher.add("Pattern Matcher", None, *patterns)
    for parsed in nlp.pipe(corpus):
        hits = pattern_matcher(parsed)
        # Each hit is (match id, start token index, end token index).
        yield from (parsed[first:last] for _, first, last in hits)
class Patterns(click.ParamType):
    """
    Click parameter type that loads match patterns from a file.

    The command-line value is a file path. A file whose extension is ``jsonl``
    is read as one JSON object per line; any other file is read as a single
    JSON array of objects. The ``"pattern"`` entry of each object is collected.
    """

    name = "patterns"

    def convert(self, value: str, _, __) -> List[dict]:
        """
        Read the patterns file at ``value``.

        Every failure is reported through ``self.fail``, click's mechanism for
        rejecting a parameter value, so the user sees a usage error rather
        than a traceback.

        :param value: path to the patterns file
        :param _: unused; click supplies the parameter object in this position
        :param __: unused; click supplies the context in this position
        :return: the ``"pattern"`` entry of every object in the file, in file order
        """
        ext = value.split(".")[-1]
        try:
            # JSON interchange text is UTF-8; do not depend on the platform default.
            with open(value, encoding="utf-8") as f:
                if ext == "jsonl":
                    # Blank lines are skipped because json.loads rejects them.
                    records = [json.loads(line) for line in f if line.strip()]
                else:
                    records = json.load(f)
            return [record["pattern"] for record in records]
        except OSError:
            self.fail(f"Cannot open {value}.")
        except JSONDecodeError as e:
            self.fail(f"Invalid JSON {e}")
        except (KeyError, TypeError):
            # A record without a "pattern" entry, or JSON that is not a
            # sequence of objects at all.
            self.fail("Invalid patterns file.")
class Corpus(click.ParamType):
    """
    Click parameter type that loads corpus texts from a file.

    The command-line value is a file path. The extension selects the format:
    ``json`` is a JSON array of objects, ``jsonl`` is one JSON object per line
    (the ``"text"`` entry of each object is used), and anything else is plain
    text with one corpus item per line.
    """

    name = "corpus"

    def convert(self, value: str, _, __) -> Iterable[str]:
        """
        Read the corpus file at ``value``.

        The texts are fully materialised before returning, so that every
        parsing or lookup error is raised inside this method and reported
        through ``self.fail`` (click's mechanism for rejecting a parameter
        value). A lazily evaluated generator would raise later, during
        iteration, outside the error handling below.

        :param value: path to the corpus file
        :param _: unused; click supplies the parameter object in this position
        :param __: unused; click supplies the context in this position
        :return: the corpus texts in file order; plain-text lines keep their
            line endings, as returned by ``readlines``
        """
        ext = value.split(".")[-1]
        try:
            # Read as UTF-8 rather than depending on the platform default.
            with open(value, encoding="utf-8") as f:
                if ext == "json":
                    return [item["text"] for item in json.load(f)]
                if ext == "jsonl":
                    # Blank lines are skipped because json.loads rejects them.
                    return [json.loads(line)["text"] for line in f if line.strip()]
                return f.readlines()
        except OSError:
            self.fail(f"Cannot open {value}")
        except JSONDecodeError as e:
            self.fail(f"Invalid JSON {e}")
        except (KeyError, TypeError):
            # A record without a "text" entry, or JSON that is not a sequence
            # of objects at all.
            self.fail("Invalid corpus file.")
@click.command()
@click.argument("corpus", type=Corpus())
@click.argument("patterns", type=Patterns())
@click.option("--language-model", default="en", help="spaCy language model (default 'en')")
def pattern_match(corpus: Iterable[str], patterns: List[dict], language_model: str):
    """
    Print all the strings in the CORPUS that match the PATTERNS.

    CORPUS is a .txt, .json, or .jsonl file that can be used as input to Prodigy.

    PATTERNS is a .json or .jsonl file that can be passed to Prodigy's --patterns option.
    """
    # NOTE: the docstring above doubles as the command's --help text, so it is
    # kept user-facing; the blank lines keep click from merging the paragraphs.
    # By the time this body runs, the Corpus and Patterns parameter types have
    # already turned the two path arguments into loaded texts and patterns.
    nlp = spacy.load(language_model)
    for match in match_patterns(nlp, patterns, corpus):
        click.echo(match)
if __name__ == "__main__":
    # Run the click command only when this file is executed as a script,
    # not when it is imported as a module.
    pattern_match()
# Originally published as a GitHub gist by @wpm (wpm/2a40784364a7398b556fb63124acf32c).