-
-
Save bittlingmayer/ba17969070c2749b478f to your computer and use it in GitHub Desktop.
Basic language identification based on NLTK stopwords
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk
from nltk.tokenize import TreebankWordTokenizer

# TODO: add more stopwords for more languages, so we can actually use this
# One entry per file id of the NLTK stopwords corpus, mapped to the set of
# stopwords listed under that id.
STOPWORDS_DICT = dict(
    (language, set(nltk.corpus.stopwords.words(language)))
    for language in nltk.corpus.stopwords.fileids()
)

# Module-level tokenizer instance used by nltk_detect().
tokenizer = TreebankWordTokenizer()
# Text type accepted by nltk_detect(): ``unicode`` on Python 2, ``str`` on
# Python 3 (where the name ``unicode`` does not exist).
try:
    _TEXT_TYPE = unicode  # noqa: F821 -- only defined on Python 2
except NameError:
    _TEXT_TYPE = str


def nltk_detect(q):
    """Guess the language of ``q`` by counting stopword matches.

    The text is tokenized, lower-cased and de-duplicated. Every entry of
    ``STOPWORDS_DICT`` is then scored by how many of its stopwords occur
    among those tokens, and the highest-scoring entry wins. On a tie the
    first maximal entry in the dict's iteration order is returned.

    Args:
        q: Text to classify; must be a unicode string (``unicode`` on
            Python 2, ``str`` on Python 3).

    Returns:
        A ``(language, match_count)`` tuple for the best-scoring entry, or
        ``('und', 0)`` when no stopword of any language matched.

    Raises:
        AssertionError: If ``q`` is not a unicode string (the check is
            skipped when Python runs with ``-O``).
    """
    assert isinstance(q, _TEXT_TYPE), 'q must be a unicode string'
    words = {token.lower() for token in tokenizer.tokenize(q)}
    counts = (
        (lang, len(words & stopwords))
        for lang, stopwords in STOPWORDS_DICT.items()
    )
    best_lang, best_count = max(counts, key=lambda pair: pair[1])
    # Debug output kept from the original; the call form prints the same
    # text on both Python 2 and Python 3.
    print(str(best_count))
    return (best_lang, best_count) if best_count else ('und', 0)
# NOTE: it probably makes sense to normalise the match counts by length.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment