Skip to content

Instantly share code, notes, and snippets.

@ljvmiranda921
Created May 23, 2022 08:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ljvmiranda921/dafdfd24c5a2904d210eb9c455b34941 to your computer and use it in GitHub Desktop.
Save ljvmiranda921/dafdfd24c5a2904d210eb9c455b34941 to your computer and use it in GitHub Desktop.
Spans key weird behaviour
import spacy
from spacy.tokens import DocBin, SpanGroup
from wasabi import msg
from copy import copy
def main():
    """Print ``doc.spans`` after a DocBin round trip for two SpanGroup setups.

    One sentence is parsed once and then copied twice. The first copy gets a
    ``SpanGroup`` created without a name, stored under ``spans_key`` and then
    extended with the doc's entities. The second copy gets a ``SpanGroup``
    constructed with an explicit name and the entity spans up front. Each
    copy is packed into a ``DocBin``, read back, and the ``spans`` of the
    restored doc are printed so the two results can be compared.
    """

    def report_roundtrip(annotated_doc):
        # Serialize a single doc, read it back using the pipeline's vocab,
        # and show which span groups the restored doc carries.
        doc_bin = DocBin(docs=[annotated_doc])
        msg.text("Deserializing the DocBin and checking the output...")
        restored_docs = list(doc_bin.get_docs(nlp.vocab))
        print(restored_docs[0].spans)

    spans_key = "sc"
    sentence = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(sentence)

    # Approach 1: create the group without a name, then extend it.
    msg.info("Trying out SpanGroup approach")
    extended_doc = copy(doc)  # work on a separate copy of the parsed doc
    if spans_key not in extended_doc.spans:
        extended_doc.spans[spans_key] = SpanGroup(extended_doc)
    stored_group = extended_doc.spans[spans_key]
    stored_group.extend(list(extended_doc.ents))
    report_roundtrip(extended_doc)

    # Approach 2: build the group with its name and spans in one go.
    msg.info("Trying out iteration approach")
    named_doc = copy(doc)  # work on a separate copy of the parsed doc
    named_doc.spans[spans_key] = SpanGroup(
        named_doc,
        name=spans_key,
        spans=list(named_doc.ents),
    )
    report_roundtrip(named_doc)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
@ljvmiranda921
Copy link
Author

Sample output:
image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment