Skip to content

Instantly share code, notes, and snippets.

@jacquesfize
Last active June 11, 2022 14:59
Show Gist options
  • Save jacquesfize/5086c7c4f6c56e9d3c7cfb1eb0010fe8 to your computer and use it in GitHub Desktop.
Save jacquesfize/5086c7c4f6c56e9d3c7cfb1eb0010fe8 to your computer and use it in GitHub Desktop.
A function to delete tokens from a spaCy Doc object without losing associated information (part-of-speech, dependency, lemma, ...)
def remove_tokens(doc, index_to_del, list_attr=None):
    """
    Remove tokens from a spaCy *Doc* object without losing the
    associated information (part-of-speech, dependency, lemma,
    user extensions, ...).

    Parameters
    ----------
    doc : spacy.tokens.doc.Doc
        spaCy representation of the text.
    index_to_del : list of int
        Positions (token indices) of each token you want to delete
        from the document.
    list_attr : list, optional
        spaCy attribute IDs to carry over to the new Doc (the default is
        [LOWER, POS, ENT_TYPE, IS_ALPHA, DEP, LEMMA, IS_PUNCT, IS_DIGIT,
        IS_SPACE, IS_STOP]).

    Returns
    -------
    spacy.tokens.doc.Doc
        Filtered version of doc.
    """
    # Avoid a mutable default argument (shared across calls); also drop
    # the duplicate LOWER entry the original default contained.
    if list_attr is None:
        list_attr = [LOWER, POS, ENT_TYPE, IS_ALPHA, DEP, LEMMA,
                     IS_PUNCT, IS_DIGIT, IS_SPACE, IS_STOP]

    np_array = doc.to_array(list_attr)  # (n_tokens, n_attrs) array view of the Doc

    # Boolean keep-mask over token positions. NOTE: `np.bool` was removed
    # in NumPy 1.24 — use the builtin `bool` as dtype instead.
    mask_to_del = np.ones(len(doc), dtype=bool)
    mask_to_del[index_to_del] = False

    np_array_2 = np_array[mask_to_del]
    # Set membership is O(1) per token vs O(n) for a list.
    index_set = set(index_to_del)
    doc2 = Doc(doc.vocab, words=[t.text for t in doc if t.i not in index_set])
    doc2.from_array(list_attr, np_array_2)

    ### Modification made by @yarongon https://gist.github.com/Jacobe2169/5086c7c4f6c56e9d3c7cfb1eb0010fe8#gistcomment-2941380
    # Handling user extensions
    # The `doc.user_data` dictionary is holding the data backing user-defined attributes.
    # The data is keyed by character offset, so a conversion is needed from the
    # old Doc to the new one.
    # More info here: https://github.com/explosion/spaCy/issues/2532
    arr = np.arange(len(doc))
    new_index_to_old = arr[mask_to_del]  # new token position -> old token position
    doc_offset_2_token = {tok.idx: tok.i for tok in doc}    # old char offset -> old token index
    doc2_token_2_offset = {tok.i: tok.idx for tok in doc2}  # new token index -> new char offset
    new_user_data = {}
    for (prefix, ext_name, offset, x), val in doc.user_data.items():
        old_token_index = doc_offset_2_token[offset]
        new_token_index = np.where(new_index_to_old == old_token_index)[0]
        if new_token_index.size == 0:  # this token was deleted; drop its user data
            continue
        new_char_index = doc2_token_2_offset[new_token_index[0]]
        new_user_data[(prefix, ext_name, new_char_index, x)] = val
    doc2.user_data = new_user_data
    return doc2
@yarongon
Copy link

Hi, I modified your code a little bit both for supporting user extensions and for performance.

def remove_tokens(doc, index_to_del, list_attr=None):
    """
    Remove tokens from a spaCy *Doc* object without losing
    associated information (part-of-speech, dependency, lemma,
    user extensions, ...).

    Parameters
    ----------
    doc : spacy.tokens.doc.Doc
        spaCy representation of the text.
    index_to_del : list of int
         Positions of each token you want to delete from the document.
    list_attr : list, optional
        Contains the spaCy attribute IDs you want to keep (the default is
        [LOWER, POS, ENT_TYPE, IS_ALPHA, DEP, LEMMA, IS_PUNCT, IS_DIGIT,
        IS_SPACE, IS_STOP]).

    Returns
    -------
    spacy.tokens.doc.Doc
        Filtered version of doc.
    """
    # A None sentinel avoids the shared mutable-default pitfall and lets
    # us drop the duplicated LOWER from the original default list.
    if list_attr is None:
        list_attr = [LOWER, POS, ENT_TYPE, IS_ALPHA, DEP, LEMMA,
                     IS_PUNCT, IS_DIGIT, IS_SPACE, IS_STOP]

    np_array = doc.to_array(list_attr)  # Array representation of Doc

    # Creating a mask: boolean array marking which token rows to keep.
    # `np.bool` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `bool` is the correct dtype.
    mask_to_del = np.ones(len(doc), dtype=bool)
    mask_to_del[index_to_del] = False

    np_array_2 = np_array[mask_to_del]
    # Hoist membership into a set: O(1) lookups instead of list scans.
    deleted = set(index_to_del)
    doc2 = Doc(doc.vocab, words=[t.text for t in doc if t.i not in deleted])
    doc2.from_array(list_attr, np_array_2)

    # Handling user extensions
    #  The `doc.user_data` dictionary is holding the data backing user-defined attributes.
    #  The data is keyed by character offset, so a conversion is needed from the
    #  old Doc to the new one.
    #  More info here: https://github.com/explosion/spaCy/issues/2532
    arr = np.arange(len(doc))
    new_index_to_old = arr[mask_to_del]  # maps new token position -> old token position
    doc_offset_2_token = {tok.idx: tok.i for tok in doc}    # old char offset -> old token index
    doc2_token_2_offset = {tok.i: tok.idx for tok in doc2}  # new token index -> new char offset
    new_user_data = {}
    for (prefix, ext_name, offset, x), val in doc.user_data.items():
        old_token_index = doc_offset_2_token[offset]
        new_token_index = np.where(new_index_to_old == old_token_index)[0]
        if new_token_index.size == 0:  # Case this index was deleted
            continue
        new_char_index = doc2_token_2_offset[new_token_index[0]]
        new_user_data[(prefix, ext_name, new_char_index, x)] = val
    doc2.user_data = new_user_data

    return doc2

@joshlk
Copy link

joshlk commented Dec 2, 2019

You need to import: from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA, DEP, LEMMA, LOWER, IS_PUNCT, IS_DIGIT, IS_SPACE, IS_STOP

@aletelecom
Copy link

This function was very helpful for me, and really saved me time.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment