Skip to content

Instantly share code, notes, and snippets.

@bittlingmayer
Created March 9, 2016 18:05
Show Gist options
  • Save bittlingmayer/ba17969070c2749b478f to your computer and use it in GitHub Desktop.
Save bittlingmayer/ba17969070c2749b478f to your computer and use it in GitHub Desktop.
Basic language identification based on NLTK stopwords
import nltk
# Stopword table built once at import time: each file id of the NLTK
# stopwords corpus (used here as the language name) maps to the set of that
# file's stopwords, so nltk_detect can score a text with set intersections.
# TODO: add more stopwords for more languages, so we can actually use this
STOPWORDS_DICT = {lang: set(nltk.corpus.stopwords.words(lang)) for lang in nltk.corpus.stopwords.fileids()}
from nltk.tokenize import TreebankWordTokenizer
# Single tokenizer instance created at import and reused by nltk_detect.
tokenizer = TreebankWordTokenizer()
def nltk_detect(q):
    """Guess the language of *q* from its overlap with known stopwords.

    The text is tokenized, lowercased and de-duplicated; every language in
    ``STOPWORDS_DICT`` is then scored by the number of its stopwords that
    occur among those tokens.

    Args:
        q: Text to classify. Must be a unicode string (``unicode`` on
            Python 2, ``str`` on Python 3).

    Returns:
        A ``(language, count)`` tuple for the best-scoring language, where
        ``count`` is the number of distinct stopwords matched. Ties go to
        whichever language ``STOPWORDS_DICT`` yields first. If nothing
        matches, or no stopword lists are loaded, ``('und', 0)`` is returned.

    Raises:
        AssertionError: If *q* is not a unicode string.
    """
    # type(u"") is `unicode` on Python 2 and `str` on Python 3, so the check
    # works on both instead of failing with NameError on Python 3.  It is an
    # explicit raise (same exception type as before) so that running with -O
    # does not silently drop the validation.
    if not isinstance(q, type(u"")):
        raise AssertionError(
            "q must be a unicode string, got %s" % type(q).__name__
        )

    # max() raises ValueError on an empty sequence; with no stopword lists
    # available the language is simply undetermined.
    if not STOPWORDS_DICT:
        return ('und', 0)

    words = {w.lower() for w in tokenizer.tokenize(q)}

    # TODO: it probably makes sense to normalise by length
    scores = (
        (lang, len(words & stopwords))
        for lang, stopwords in STOPWORDS_DICT.items()
    )
    best_lang, best_count = max(scores, key=lambda pair: pair[1])

    # The former debug print of the count was dropped: the count is already
    # part of the return value.
    return (best_lang, best_count) if best_count else ('und', 0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment