Skip to content

Instantly share code, notes, and snippets.

@fherbine
Created June 19, 2020 11:35
Show Gist options
  • Save fherbine/d84b8650c80af778a380d8ff3f253c1d to your computer and use it in GitHub Desktop.
Save fherbine/d84b8650c80af778a380d8ff3f253c1d to your computer and use it in GitHub Desktop.
Reproduction (in French) of an exercise from: NLP - Crash Course AI
""" French cat AI.
This code is inspired by the Crash Course AI episode about NLP:
https://www.youtube.com/watch?v=oi0JXuL19TA
"""
__author__ = 'fherbine'
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from tabulate import tabulate
# Parser name handed to BeautifulSoup when the downloaded pages are parsed.
bs_parser = 'html.parser'
# Snowball stemmer configured for French; applied to every token of the texts.
stemmer = SnowballStemmer(language='french')
def word_frequency(tokenized_words, top_n=10):
    """Print a table of the most frequent tokens.

    Args:
        tokenized_words: Iterable of tokens (words) to count.
        top_n: Number of most common tokens to show. Defaults to 10, the
            value that used to be hard-coded.

    Returns:
        None. The table is written to standard output.
    """
    fdist = nltk.FreqDist(tokenized_words)
    # most_common() yields (token, count) pairs, which map directly onto the
    # two table columns.
    rows = fdist.most_common(top_n)
    print(tabulate(rows, headers=['word', 'frequency']))
# Fetch the French Wikipedia articles for "Chat" (cat) and "Felinae".
# The timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() aborts on an HTTP error instead of analysing the text of
# an error page as if it were the article.
french_cat_wiki = requests.get(
    'https://fr.wikipedia.org/wiki/Chat', timeout=30
)
french_cat_wiki.raise_for_status()
french_felinae_wiki = requests.get(
    'https://fr.wikipedia.org/wiki/Felinae', timeout=30
)
french_felinae_wiki.raise_for_status()

# Parse the downloaded HTML.
french_cat_content = BeautifulSoup(french_cat_wiki.content, bs_parser)
french_felinae_content = BeautifulSoup(french_felinae_wiki.content, bs_parser)

# Extract the text of every <p> element and tokenize it in a very simple way.
# NOTE: str.split() only splits on whitespace, so punctuation stays attached
# to the neighbouring word.
cat_text = '\n'.join(
    p.text for p in french_cat_content.find_all('p')
).split()
felinae_text = '\n'.join(
    p.text for p in french_felinae_content.find_all('p')
).split()

# Remove French stopwords BEFORE stemming: the stopword list holds un-stemmed
# words, so filtering stemmed tokens against it lets through every stopword
# whose stem differs from its listed form. Tokens are lower-cased for the
# comparison so sentence-initial words such as "Le" are still matched.
french_sw = set(stopwords.words('french'))
cat_text = [word for word in cat_text if word.lower() not in french_sw]
felinae_text = [
    word for word in felinae_text if word.lower() not in french_sw
]

# Stem the remaining tokens so inflected forms of a word are counted together.
cat_text = [stemmer.stem(word) for word in cat_text]
felinae_text = [stemmer.stem(word) for word in felinae_text]

# Print the most frequent stems of each article.
word_frequency(cat_text)
word_frequency(felinae_text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment