Skip to content

Instantly share code, notes, and snippets.

View widiger-anna's full-sized avatar

Anna Widiger widiger-anna

  • San Francisco
View GitHub Profile
@widiger-anna
widiger-anna / recipe_yelp_review.py
Last active May 4, 2018 18:32
NLP recipe 2: processing Yelp reviews with spaCy
from __future__ import unicode_literals, print_function
import spacy
from spacy.attrs import ORTH, LEMMA, NORM, TAG
import re
'''
NLP tasks:
1. expand contractions to improve tokenization
2. filter stopwords and punctuation
3. track named entities (locations, names, money, etc.)
@widiger-anna
widiger-anna / recipe_tweet.py
Last active May 4, 2018 18:32
NLP recipe 1: processing tweets with spaCy
from __future__ import unicode_literals, print_function
#pip install spacy && python -m spacy download en
import spacy
'''
NLP tasks:
1. tokenization
2. part of speech tagging
3. chunking
@widiger-anna
widiger-anna / clean_word.py
Last active May 4, 2018 18:38
Regex examples for removing numbers and special characters inside a word
from __future__ import unicode_literals, print_function
import re
'''
For a token (word):
clean_words removes anything that is not alphanumeric (numbers, special characters)
remove_numbers substitutes a number with a placeholder NUMBER
'''
def clean_words(word):
try:
@widiger-anna
widiger-anna / contractions_test.py
Created May 4, 2018 17:31
Expanding contractions with spaCy
from __future__ import unicode_literals, print_function
import spacy
from spacy.attrs import ORTH, LEMMA, NORM, TAG
from spacy.lang.en.stop_words import STOP_WORDS
from spacy import displacy
from pathlib import Path
import re
nlp = spacy.load('en_core_web_sm')
# example of expanding contractions using regexes (slow for a big corpus)
@widiger-anna
widiger-anna / language_detection.py
Last active May 5, 2018 19:29
Language Detection Test for EN and DE using trigrams
from __future__ import unicode_literals, print_function
import os
'''
Example of analyzing a string using a sliding window (trigrams, n=3)
counts all occurrences of trigrams and stores them in a dictionary
uses stats of trigram frequencies to identify a language
'''
def read_file(filename):
if os.path.isfile(filename):
fh = open(filename,'rb')
@widiger-anna
widiger-anna / spacyemoji_test.py
Last active May 4, 2018 18:34
Testing spaCy tokenizer
from __future__ import unicode_literals, print_function
# Install spaCy: pip install spacy && python -m spacy download en
import spacy
from spacymoji import Emoji
nlp = spacy.load('en_core_web_sm')
emoji = Emoji(nlp)
nlp.add_pipe(emoji, first=True)
@widiger-anna
widiger-anna / spellcheck_test.py
Last active May 4, 2018 18:36
Spell check for English using PyEnchant and spaCy
from __future__ import unicode_literals, print_function
# Install PyEnchant: https://github.com/rfk/pyenchant
# Install spaCy: pip install spacy && python -m spacy download en
import spacy
import enchant
nlp = spacy.load('en_core_web_sm')
doc = nlp(u"This sentence is grammatically correct. THis sentence definately has sum mispeled wordz.")