Last active
April 9, 2021 08:43
-
-
Save jul/c57774d4bd4cfa681c59afa74d13a043 to your computer and use it in GitHub Desktop.
Tokenisation et racinisation (stemming) en français avec ce pot de pus appelé NLTK.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# vim: set fileencoding=utf-8 : | |
import nltk | |
import nltk.data | |
import re | |
from nltk.corpus import stopwords | |
from nltk.stem.snowball import SnowballStemmer | |
from vector_dict.VectorDict import VectorDict | |
from nltk import word_tokenize as wt | |
# Crude regexp tokenizer: split on runs of whitespace and common French
# punctuation (period, bang, double quote, question mark, hyphen, comma,
# apostrophe).  Raw string avoids the needless backslash escapes; the
# hyphen stays escaped so it is not read as a character range.
sp_pattern = re.compile(r"""[.!"\s?\-,']+""", re.M)
stupid_tokenizer = sp_pattern.split

# French stop words as a set for O(1) membership tests.
french_stopwords = set(stopwords.words('french'))


def filt_out(text):
    """Return the tokens of *text* that are not French stop words."""
    return [token for token in text if token.lower() not in french_stopwords]


def fr_stop(token):
    """Predicate: True for a non-empty token that is not a French stop word."""
    return bool(token) and token.lower() not in french_stopwords
# Three short French job-ad snippets used as the test corpus:
# `data` and `data2` read like startup-style ads, `data3` like a plainer
# company pitch — the point is to compare their Jaccard similarity below.
data = u"""Nous recherchons -pour les besoins d'une société en plein essor- un petit jeune passionné,
plein d'entrain, pour travailler dans un domaine intellectuellement stimulant."""
data2 = u"""pour le compte d'une société en plein essor, nous recherchons un jeune qui veut se faire exploiter"""
data3 = u"""Nous avons un vrai métier, et on a besoin de produire
"""
## we are brave, we execute arbitrary code from unchecked origin \o/ | |
## this code does a nltk.data.load('tokenizers/punkt/french.pickle') in your back | |
## executing unknown code from unchecked origin. I do not advise to do so! | |
print "//".join(filt_out( wt(data, language="french"))) | |
### let's see if a regexp does better | |
print "//".join(filt_out( stupid_tokenizer(data))) | |
stemmer = SnowballStemmer("french", ignore_stopwords=True) | |
stemmer2 = SnowballStemmer("french", ignore_stopwords=False) | |
print "//".join( | |
map( | |
stemmer2.stem, filter( | |
fr_stop, | |
stupid_tokenizer(data) | |
) | |
) | |
) | |
def final_vectorizer(text):
    """Vectorize *text* into a bag-of-stems VectorDict.

    Tokenize with the regexp tokenizer, drop French stop words, stem
    each surviving token with the French Snowball stemmer, then sum
    one-hot VectorDicts so each stem maps to its occurrence count.
    """
    stems = (
        stemmer2.stem(token)
        for token in stupid_tokenizer(text)
        if fr_stop(token)
    )
    # Seed reduce with an empty vector so an empty or stop-word-only
    # text yields an empty VectorDict instead of raising TypeError.
    # assumes VectorDict(dict, {}) builds an empty vector — TODO confirm
    return reduce(
        VectorDict.__add__,
        (VectorDict(dict, {stem: 1}) for stem in stems),
        VectorDict(dict, {}),
    )
sample = final_vectorizer(data) | |
startup = final_vectorizer(data2) | |
normal_company = final_vectorizer(data3) | |
sample.tprint() | |
startup.tprint() | |
normal_company.tprint() | |
print "Similarité startup : %.3f" % sample.jaccard(startup) | |
print "Similarité client final: %.3f" % sample.jaccard(normal_company) | |
# Transcript of the script's output (tokenized streams, stem-count
# vectors, Jaccard scores), kept as a module-level string for reference.
""" Rés:
recherchons//-pour//les//besoins//d'une//société//plein//essor-//petit//jeune//passionné//,//plein//d'entrain//,//travailler//domaine//intellectuellement//stimulant//.
recherchons//les//besoins//société//plein//essor//petit//jeune//passionné//plein//entrain//travailler//domaine//intellectuellement//stimulant//
recherchon//le//besoin//societ//plein//essor//pet//jeun//passion//plein//entrain//travaill//domain//intellectuel//stimul
{
u'domain' : 1,
u'le' : 1,
u'plein' : 2,
u'jeun' : 1,
u'recherchon' : 1,
u'pet' : 1,
u'stimul' : 1,
u'travaill' : 1,
u'entrain' : 1,
u'societ' : 1,
u'passion' : 1,
u'besoin' : 1,
u'essor' : 1,
u'intellectuel' : 1,
}
{
u'societ' : 1,
u'plein' : 1,
u'jeun' : 1,
u'recherchon' : 1,
u'veut' : 1,
u'exploit' : 1,
u'compt' : 1,
u'essor' : 1,
u'fair' : 1,
}
{
u'a' : 1,
u'produir' : 1,
u'vrai' : 1,
u'besoin' : 1,
u'm\xe9ti' : 1,
}
Similarité startup : 0.300
Similarité client final: 0.048
"""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment