Anna Widiger widiger-anna

## recipe_yelp_review.py
from __future__ import unicode_literals, print_function
import spacy
from spacy.attrs import ORTH, LEMMA, NORM, TAG
import re

'''
NLP tasks:
1. expand contractions to improve tokenization
2. filter stopwords and punctuation
3. track named entities (locations, names, money, etc.)

## recipe_tweet.py
from __future__ import unicode_literals, print_function

#pip install spacy && python -m spacy download en
import spacy

'''
NLP tasks:
1.tokenization
2.part of speech tagging
3.chunking

## clean_word.py
from __future__ import unicode_literals, print_function
import re

'''
For a token (word):
clean_words removes anything that is not alphanumeric (numbers, special characters)
remove_numbers substitutes a number with a placeholder NUMBER
'''
def clean_words(word):
  try:

## contractions_test.py
from __future__ import unicode_literals, print_function
import spacy
from spacy.attrs import ORTH, LEMMA, NORM, TAG
from spacy.lang.en.stop_words import STOP_WORDS
from spacy import displacy
from pathlib import Path
import re

nlp = spacy.load('en_core_web_sm')
# example of expanding contractions using regexes (slow for a big corpus)

## language_detection.py
from __future__ import unicode_literals, print_function
import os
'''
Example of analyzing a string using a sliding window (trigrams, n=3)
counts all occurrences of trigrams, stores them in a dictionary
uses stats of trigram frequencies to identify a language
'''
def read_file(filename):
  if os.path.isfile(filename):
    fh = open(filename,'rb')

## spacyemoji_test.py
from __future__ import unicode_literals, print_function

# Install spaCy: pip install spacy && python -m spacy download en
import spacy
from spacymoji import Emoji

nlp = spacy.load('en_core_web_sm')
emoji = Emoji(nlp)
nlp.add_pipe(emoji, first=True)

## spellcheck_test.py
from __future__ import unicode_literals, print_function

# Install pyenchant: PyEnchant https://github.com/rfk/pyenchant
# Install spaCy: pip install spacy && python -m spacy download en

import spacy
import enchant

nlp = spacy.load('en_core_web_sm')
doc = nlp(u"This sentence is grammatically correct. THis sentence definately has sum mispeled wordz.")
	from __future__ import unicode_literals, print_function
	import spacy
	from spacy.attrs import ORTH, LEMMA, NORM, TAG
	import re

	'''
	NLP tasks:
	1. expand contractions to improve tokenization
	2. filter stopwords and punctuation
	3. track named entities (locations, names, money, etc.)
	from __future__ import unicode_literals, print_function

	#pip install spacy && python -m spacy download en
	import spacy

	'''
	NLP tasks:
	1.tokenization
	2.part of speech tagging
	3.chunking
	from __future__ import unicode_literals, print_function
	import re

	'''
	For a token (word):
	clean_words removes anything that is not alphanumeric (numbers, special characters)
	remove_numbers substitutes a number with a placeholder NUMBER
	'''
	def clean_words(word):
	try:
	from __future__ import unicode_literals, print_function
	import os
	'''
	Example of analyzing a string using a sliding window (trigrams, n=3)
	counts all occurrences of trigrams, stores them in a dictionary
	uses stats of trigram frequencies to identify a language
	'''
	def read_file(filename):
	if os.path.isfile(filename):
	fh = open(filename,'rb')
	from __future__ import unicode_literals, print_function

	# Install spaCy: pip install spacy && python -m spacy download en
	import spacy
	from spacymoji import Emoji

	nlp = spacy.load('en_core_web_sm')
	emoji = Emoji(nlp)
	nlp.add_pipe(emoji, first=True)
	from __future__ import unicode_literals, print_function

	# Install pyenchant: PyEnchant https://github.com/rfk/pyenchant
	# Install spaCy: pip install spacy && python -m spacy download en

	import spacy
	import enchant

	nlp = spacy.load('en_core_web_sm')
	doc = nlp(u"This sentence is grammatically correct. THis sentence definately has sum mispeled wordz.")