Skip to content

Instantly share code, notes, and snippets.

@thomwolf
Last active June 11, 2018 13:50
Show Gist options
  • Save thomwolf/85603dcd70269a26bf8567f4f1d018b9 to your computer and use it in GitHub Desktop.
Save thomwolf/85603dcd70269a26bf8567f4f1d018b9 to your computer and use it in GitHub Desktop.
Example of Cython loop to count the occurrences of the word "run" tagged as noun by spaCy in a portion of Wikitext2
%%cython -+
import numpy # Sometimes we get a "failed to import numpy" compilation error if we don't import numpy
from cymem.cymem cimport Pool
from spacy.tokens.doc cimport Doc
from spacy.typedefs cimport hash_t
from spacy.structs cimport TokenC
# C-level record describing one document's token array, so the counting loop
# can work on raw pointers without touching Python objects.
cdef struct DocElement:
    # Pointer to the first token of the document (filled from `Doc.c`).
    TokenC* c
    # Number of tokens reachable through `c` (filled from `Doc.length`).
    int length
# Count the tokens, across the first `n_docs` entries of `docs`, whose
# lexeme's `lower` hash equals `word` and whose `tag` equals `tag`.
# Returns the number of matching tokens.
cdef int fast_loop(DocElement* docs, int n_docs, hash_t word, hash_t tag):
    cdef int matches = 0
    cdef int doc_idx, tok_idx, n_tokens
    cdef TokenC* tokens
    for doc_idx in range(n_docs):
        tokens = docs[doc_idx].c
        n_tokens = docs[doc_idx].length
        for tok_idx in range(n_tokens):
            # Both comparisons are on integer hashes; no string work here.
            if tokens[tok_idx].lex.lower == word and tokens[tok_idx].tag == tag:
                matches += 1
    return matches
def main_nlp_fast(doc_list):
    """Print how many tokens in `doc_list` are the word "run" tagged "NN".

    `doc_list` must support `len()` and yield spaCy `Doc` objects when
    iterated. Each document's token pointer and length are copied into a
    C array of `DocElement`, then `fast_loop` counts the matching tokens.
    The count is printed; nothing is returned. An empty `doc_list`
    prints 0.
    """
    cdef int i, n_out, n_docs = len(doc_list)
    cdef Pool mem
    cdef DocElement* docs
    cdef Doc doc

    if n_docs == 0:
        # Without at least one document `doc` would still be None when its
        # vocab is read below, and native attribute access on None is not
        # checked by Cython unless `nonecheck` is enabled.
        print(0)
        return

    mem = Pool()
    docs = <DocElement*>mem.alloc(n_docs, sizeof(DocElement))
    for i, doc in enumerate(doc_list): # Populate our database structure
        # Only the pointer is stored: the token data stays owned by the Doc.
        docs[i].c = doc.c
        docs[i].length = doc.length
    # The hashes come from the last document's vocab.
    # NOTE(review): presumably every Doc in doc_list shares one vocab - confirm.
    word_hash = doc.vocab.strings.add('run')
    tag_hash = doc.vocab.strings.add('NN')
    n_out = fast_loop(docs, n_docs, word_hash, tag_hash)
    print(n_out)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment