fherbine/french_cat_ai.py

## french_cat_ai.py
""" French cat AI.

This code is inspired by Crash Course AI's video about NLP:
https://www.youtube.com/watch?v=oi0JXuL19TA
"""


__author__ = 'fherbine'
import requests


from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from tabulate import tabulate

# Name of the parser backend passed to every BeautifulSoup(...) call below.
bs_parser = 'html.parser'
# Snowball stemmer configured for French; applied to each token of the texts.
stemmer = SnowballStemmer(language='french')


def word_frequency(tokenized_words, top_n=10):
    """Print a table of the most frequent tokens.

    Args:
        tokenized_words: The tokens to count (here: a list of stemmed
            words). They are handed as-is to ``nltk.FreqDist``.
        top_n: Number of most frequent tokens to display. Defaults to 10,
            the value that used to be hard-coded.

    Returns:
        None. The table is written to standard output.
    """
    fdist = nltk.FreqDist(tokenized_words)
    print(tabulate(fdist.most_common(top_n), headers=['word', 'frequency']))


# Fetch the French Wikipedia articles "Chat" and "Felinae".
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() aborts on an HTTP error instead of silently
# analysing the text of an error page.
french_cat_wiki = requests.get(
    'https://fr.wikipedia.org/wiki/Chat', timeout=30
)
french_cat_wiki.raise_for_status()
french_felinae_wiki = requests.get(
    'https://fr.wikipedia.org/wiki/Felinae', timeout=30
)
french_felinae_wiki.raise_for_status()

# Parse the downloaded HTML.
french_cat_content = BeautifulSoup(french_cat_wiki.content, bs_parser)
french_felinae_content = BeautifulSoup(french_felinae_wiki.content, bs_parser)

# Extract the text of every <p> element and tokenize it in a very simple
# way: split on whitespace (punctuation stays attached to the words).
cat_text = '\n'.join(
    [p.text for p in french_cat_content.find_all('p')]
).split()
felinae_text = '\n'.join(
    [p.text for p in french_felinae_content.find_all('p')]
).split()

# Remove French stopwords BEFORE stemming: the stopword list contains
# unstemmed forms, so a stemmed token such as 'notr' (from 'notre') would no
# longer match it. Tokens are lower-cased for the comparison because the
# stopword list is lower-case.
french_sw = set(stopwords.words('french'))
cat_text = [
    word for word in cat_text if word.lower() not in french_sw
]
felinae_text = [
    word for word in felinae_text if word.lower() not in french_sw
]

# Stem the remaining tokens so inflected forms are counted together.
cat_text = [stemmer.stem(word) for word in cat_text]
felinae_text = [stemmer.stem(word) for word in felinae_text]

# Show the most frequent stems of each article.
word_frequency(cat_text)
word_frequency(felinae_text)
	""" French cat AI.

	This code is inspired by Crash Course AI's video about NLP:
	https://www.youtube.com/watch?v=oi0JXuL19TA
	"""


	__author__ = 'fherbine'
	import requests


	from bs4 import BeautifulSoup
	import nltk
	from nltk.corpus import stopwords
	from nltk.stem.snowball import SnowballStemmer
	from tabulate import tabulate

	# Name of the parser backend passed to every BeautifulSoup(...) call below.
	bs_parser = 'html.parser'
	# Snowball stemmer configured for French; applied to each token of the texts.
	stemmer = SnowballStemmer(language='french')


	def word_frequency(tokenized_words, top_n=10):
		"""Print a table of the most frequent tokens.

		Args:
			tokenized_words: The tokens to count (here: a list of stemmed
				words). They are handed as-is to ``nltk.FreqDist``.
			top_n: Number of most frequent tokens to display. Defaults to
				10, the value that used to be hard-coded.

		Returns:
			None. The table is written to standard output.
		"""
		# The body must be indented below the ``def`` line; at the same
		# column as ``def`` the function has no body and does not parse.
		fdist = nltk.FreqDist(tokenized_words)
		print(tabulate(fdist.most_common(top_n), headers=['word', 'frequency']))


	# Fetch the French Wikipedia articles "Chat" and "Felinae".
	# The timeout keeps the script from hanging forever on a stalled connection,
	# and raise_for_status() aborts on an HTTP error instead of silently
	# analysing the text of an error page.
	french_cat_wiki = requests.get(
		'https://fr.wikipedia.org/wiki/Chat', timeout=30
	)
	french_cat_wiki.raise_for_status()
	french_felinae_wiki = requests.get(
		'https://fr.wikipedia.org/wiki/Felinae', timeout=30
	)
	french_felinae_wiki.raise_for_status()

	# Parse the downloaded HTML.
	french_cat_content = BeautifulSoup(french_cat_wiki.content, bs_parser)
	french_felinae_content = BeautifulSoup(french_felinae_wiki.content, bs_parser)

	# Extract the text of every <p> element and tokenize it in a very simple
	# way: split on whitespace (punctuation stays attached to the words).
	cat_text = '\n'.join(
		[p.text for p in french_cat_content.find_all('p')]
	).split()
	felinae_text = '\n'.join(
		[p.text for p in french_felinae_content.find_all('p')]
	).split()

	# Remove French stopwords BEFORE stemming: the stopword list contains
	# unstemmed forms, so a stemmed token such as 'notr' (from 'notre') would no
	# longer match it. Tokens are lower-cased for the comparison because the
	# stopword list is lower-case.
	french_sw = set(stopwords.words('french'))
	cat_text = [
		word for word in cat_text if word.lower() not in french_sw
	]
	felinae_text = [
		word for word in felinae_text if word.lower() not in french_sw
	]

	# Stem the remaining tokens so inflected forms are counted together.
	cat_text = [stemmer.stem(word) for word in cat_text]
	felinae_text = [stemmer.stem(word) for word in felinae_text]

	# Show the most frequent stems of each article.
	word_frequency(cat_text)
	word_frequency(felinae_text)