@sangheestyle
Last active September 2, 2022 10:37
Removing punctuation and stop words, and stemming the contents with NLTK
import os
import json
import shutil
from subprocess import call
import cld
def read_json(file_path):
    json_data = open(file_path)
    data = json.load(json_data)
    return data


def get_desc(file_path):
    data = read_json(file_path)
    description = data['extendedInfo']['description']
    return description.encode('ascii', errors='ignore')  # FIXME: workaround


def get_desc_from_folder(folder_path, desc_count=1000):
    name_desc_pairs = {}
    count = desc_count
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".json"):
                if len(name_desc_pairs) < count:
                    # FIXME: unicode error \xc3\xa2\xc2\x99\xc2\xa3
                    desc = get_desc(os.path.join(root, file))
                    desc_utf8 = desc.encode('utf-8')
                    if len(desc) > 1000:
                        lang = cld.detect(desc_utf8)
                        if lang[1] == 'en' and len(lang[4]) == 1:
                            name_desc_pairs[file] = desc
    return name_desc_pairs
folder_path = "data_google_play"
if not os.path.exists(folder_path):
call(["git", "clone", "https://github.com/sangheestyle/data_google_play.git"])
name_desc_pairs = get_desc_from_folder(folder_path)
documents = name_desc_pairs.values()
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.cluster import KMeansClusterer, euclidean_distance
from gensim import corpora, models, utils
from numpy import array
# Step 1:
# override LdaModel for changing output format
# from [(topicid, topicvalue)] to [topicvalue] due to format of KMeansClusterer
class MyLdaModel(models.ldamodel.LdaModel):
    def __getitem__(self, bow, eps=0.01):
        is_corpus, corpus = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(corpus)
        gamma, _ = self.inference([bow])
        topic_dist = gamma[0] / sum(gamma[0])  # normalize to proper distribution
        return [topicvalue for topicid, topicvalue in enumerate(topic_dist)]
        # FIXME: if topicvalue >= eps]
# Step 2: removing whitespaces, punctuations, stopwords, and stemming words
processed = []
for document in documents:
    tokenizer = RegexpTokenizer(r'\w+')
    intermediate = tokenizer.tokenize(document)
    stop = stopwords.words('english')
    intermediate = [i for i in intermediate if i not in stop]
    # FIXME: using other stemmers also to know quality of each stemmed text
    lanste = LancasterStemmer()
    intermediate = [lanste.stem(i) for i in intermediate]
    processed.append(intermediate)
# Step 3
# making dictionary and corpus
dictionary = corpora.Dictionary(processed)
#dictionary.save('/tmp/dict.dict')
corpus = [dictionary.doc2bow(description) for description in processed]
#corpora.MmCorpus.serialize('/tmp/temp.mm', corpus)

# Step 4: LDA
num_topics = 5
model_lda = MyLdaModel(corpus, id2word=dictionary, num_topics=num_topics)
doc_lda = model_lda[corpus]
'''
for doc in doc_lda:
    print doc
'''
# Step 5: k-means clustering
vectors = [array(f) for f in doc_lda]
clusterer = KMeansClusterer(num_topics, euclidean_distance, repeats=100, avoid_empty_clusters=True)
clusterer.cluster(vectors, True)

apps_per_topic = []
for x in range(num_topics):
    apps_per_topic.append([])

# assign each app's topic vector to a cluster
apk_names = name_desc_pairs.keys()
for i, doc in enumerate(doc_lda):
    topic_id = clusterer.classify(array(doc))
    apps_per_topic[topic_id].append(apk_names[i])
# Step 6: make text for each topic
text_for_topics = []
for x in range(num_topics):
    text_for_topics.append('')

apkname_stem_pairs = dict(zip(name_desc_pairs.keys(), processed))
for topic_id, names in enumerate(apps_per_topic):
    for name in names:
        # FIXME: there are two options for the word cloud: 1) pure descriptions 2) stemmed text
        # text_for_topics[topic_id] = text_for_topics[topic_id] + " " + name_desc_pairs[name]
        text = " ".join(apkname_stem_pairs[name])
        text_for_topics[topic_id] = text_for_topics[topic_id] + " " + text

output_path = "out"
if os.path.exists(output_path):
    shutil.rmtree(output_path)
os.mkdir(output_path)

for topic_id, text_for_topic in enumerate(text_for_topics):
    file_name = "topic-" + str(topic_id) + ".txt"
    text_file = open(os.path.join(output_path, file_name), "w")
    text_file.write(text_for_topic)
    text_file.close()

# Step 7: word cloud - TBD
# FIXME: need to implement, or just use http://www.wordle.net temporarily
@sangheestyle (Author)

Result:

[['hum', 'machin', 'interfac', 'lab', 'abc', 'comput', 'apply'], ['a', 'survey', 'us', 'opin', 'comput', 'system', 'respons', 'tim'], ['the', 'ep', 'us', 'interfac', 'man', 'system'], ['system', 'hum', 'system', 'engin', 'test', 'ep'], ['rel', 'us', 'perceiv', 'respons', 'tim', 'er', 'meas'], ['the', 'gen', 'random', 'bin', 'unord', 'tre'], ['the', 'intersect', 'graph', 'path', 'tre'], ['graph', 'min', 'iv', 'width', 'tre', 'wel', 'quas', 'ord'], ['graph', 'min', 'a', 'survey'], ['the', 'gen', 'random', 'bin', 'unord', 'tre', 'survey']]

@sangheestyle (Author)

The next steps are LDA, k-means clustering, and visualization.

@sangheestyle (Author)

Here is the 2nd version, which does LDA with gensim.

@sangheestyle (Author)

Result: 3 topics and 10 sentences (3 X 10)

[(0, 0.043487224655015692), (1, 0.91334320179268014), (2, 0.043169573552304011)]
[(0, 0.038937039435168998), (1, 0.041896434430366933), (2, 0.91916652613446403)]
[(0, 0.053246879732978467), (1, 0.050838987090679848), (2, 0.89591413317634161)]
[(0, 0.90083829551970018), (1, 0.048699129697301391), (2, 0.050462574782998426)]
[(0, 0.042326027355327193), (1, 0.91310314616436017), (2, 0.044570826480312589)]
[(0, 0.048260934238125504), (1, 0.048299573968107745), (2, 0.90343949179376681)]
[(0, 0.056756398955460384), (1, 0.057671474088390323), (2, 0.88557212695614929)]
[(0, 0.038675284258316643), (1, 0.91980941304997033), (2, 0.041515302691713184)]
[(0, 0.06859140004108788), (1, 0.072816298081204517), (2, 0.85859230187770763)]
[(0, 0.042202610067397242), (1, 0.042203849123989244), (2, 0.91559354080861355)]

@sangheestyle (Author)

The next step is applying k-means clustering and visualization.

@sangheestyle (Author)

Result: cluster id for each sentence. For example, the 2nd and 5th sentences are in the same cluster.

1
0
2
1
0
2
1
1
1
2

@sangheestyle (Author)

The next step is applying visualization.

@sangheestyle (Author)

References so far:

http://www.nltk.org (nltk)
https://github.com/nltk/nltk (nltk)
http://radimrehurek.com/gensim (gensim)
https://github.com/piskvorky/gensim (gensim)
http://www.wikipedia.org (terms)

Natural Language Processing with Python (book for nltk)
Data Mining: Practical Machine Learning Tools and Techniques (book for NLP and Weka)

@sangheestyle (Author)

Review of revision 6:

It took a long time to work around the "unicode" error. Also, I need to check lines 55 and 56 to understand what eps is.
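
My current reading (not verified yet): in the stock gensim LdaModel, eps is the threshold below which a topic's weight is dropped from the returned list, which is exactly what the FIXME'd "if topicvalue >= eps" clause in MyLdaModel.__getitem__ would restore. Since KMeansClusterer needs fixed-length vectors, here is a small sketch of an alternative that keeps the length constant (clamp_small_topics is just a hypothetical helper, not part of the gist):

def clamp_small_topics(topic_dist, eps=0.01):
    # keep one value per topic so every vector has the same length,
    # but treat weights below eps as exactly zero
    return [value if value >= eps else 0.0 for value in topic_dist]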

@sangheestyle (Author)

For generating word cloud, I will use pytagcloud.
https://pypi.python.org/pypi/pytagcloud
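
A rough sketch of what Step 7 could look like with pytagcloud, reading the out/topic-*.txt files written in Step 6 (untested; the top-50 cut and the image size are placeholder values):

import os
from pytagcloud import create_tag_image, make_tags
from pytagcloud.lang.counter import get_tag_counts

output_path = "out"
for file_name in sorted(os.listdir(output_path)):
    if not file_name.endswith(".txt"):
        continue
    with open(os.path.join(output_path, file_name)) as text_file:
        text = text_file.read()
    # count words, scale the most frequent ones to font sizes, render one PNG per topic
    tags = make_tags(get_tag_counts(text)[:50], maxsize=80)
    image_name = file_name.replace(".txt", ".png")
    create_tag_image(tags, os.path.join(output_path, image_name), size=(600, 400))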

@sangheestyle (Author)

Uploaded 1365 JSON files to GitHub, each containing one APK's information from Google Play. Each file includes more than 1000 characters of description.

https://github.com/sangheestyle/data_google_play.git
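
The only field the script actually reads from each file is extendedInfo.description; a minimal example of that shape (the description text here is just a placeholder):

sample = {
    "extendedInfo": {
        "description": "more than 1000 characters of English description text ..."
    }
}
# get_desc() returns sample['extendedInfo']['description']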

@sangheestyle (Author)

Revision 8:

  • remove some package imports

@sangheestyle (Author)

Revision 9:

  • download sample JSON files from GitHub for testing (about 1000 JSON files)
  • generate text files containing every description for the same topic (e.g. out/topic-0.txt)

@roumaissabensaid

How can I apply this to an Excel file? Please, I'm blocked.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment