Skip to content

Instantly share code, notes, and snippets.

@SuzanaK
Created March 26, 2013 14:00
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save SuzanaK/5245578 to your computer and use it in GitHub Desktop.
Save SuzanaK/5245578 to your computer and use it in GitHub Desktop.
NLTK Code Snippets
# Language detection for German and English text, using the stopword lists shipped with the Python NLTK
import nltk
# Words subtracted from both stopword lists below. The original note says they
# were "too common" -- presumably they occur in both languages and therefore do
# not help to tell German and English apart.
COMMON_STOPWORDS = {'am', 'an', 'so', 'was', 'will'}

# Per-language stopword sets with the shared words above removed.
GERMAN_STOPWORDS = set(nltk.corpus.stopwords.words('german')) - COMMON_STOPWORDS
ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words('english')) - COMMON_STOPWORDS
def detect_language(sentences):
    """Guess whether a text is German or English by counting stopwords.

    Every sentence is tokenized with ``nltk.word_tokenize`` and each token is
    looked up (lowercased) in ``GERMAN_STOPWORDS`` and ``ENGLISH_STOPWORDS``.
    The language with more stopword hits wins.

    Args:
        sentences: An iterable of sentence strings, or a single string, which
            is treated as one sentence. Per the original note, NLTK handles
            unicode text but not UTF-8 encoded byte strings.

    Returns:
        ``"other"`` if no stopword of either language was found, ``"german"``
        if there are at least as many German as English hits (ties go to
        German), otherwise ``"english"``.
    """
    # A bare string would otherwise be iterated character by character, and
    # single letters would be counted as stopwords. ``type(u"")`` also covers
    # ``unicode`` on Python 2.
    if isinstance(sentences, (str, type(u""))):
        sentences = [sentences]

    counter_german = 0
    counter_english = 0
    for s in sentences:
        for w in nltk.word_tokenize(s):
            # The stopword lists are lowercase, so normalise the token first;
            # otherwise sentence-initial words ("The", "Der") and "I" never match.
            w = w.lower()
            if w in GERMAN_STOPWORDS:
                counter_german += 1
            if w in ENGLISH_STOPWORDS:
                counter_english += 1

    if counter_german == 0 and counter_english == 0:
        return "other"
    if counter_german >= counter_english:
        return "german"
    return "english"
import nltk
# `reader` is an NLTK corpus reader (it must be defined before this snippet runs).
# Train a classifier-based POS tagger on the first 90% of the tagged sentences
# and evaluate it on the remaining 10%.
tagged_sents = reader.tagged_sents()  # fetch once instead of three times
cut = int(0.9 * len(tagged_sents))
trains = tagged_sents[:cut]
# The test split starts at `cut`, not `cut + 1`: the training slice already
# excludes index `cut`, so starting one later would silently drop a sentence.
tests = tagged_sents[cut:]
tagger = nltk.tag.ClassifierBasedPOSTagger(train=trains, cutoff_prob=0.95, verbose=True)
tagger.evaluate(tests)
# various NLTK code snippets
import nltk
# Create a Stanford POS tagger from the German "fast" model file and the
# tagger jar (both paths are relative to the working directory).
tagger = nltk.tag.stanford.POSTagger(
    '../../tagger/stanford/models/german-fast.tagger',
    '../../tagger/stanford/stanford-postagger.jar'
)

# Each sentence string is UTF-8 encoded before tokenizing, because this tagger
# model needs UTF-8 encoded input; then every token list is tagged.
# NOTE(review): this assumes Python 2 style strings, where word_tokenize accepts
# the encoded byte string -- confirm before running on Python 3.
tagged_sentences = [
    tagger.tag(nltk.word_tokenize(sentence.encode('utf-8')))
    for sentence in sentences
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment