ryanbateman/ddg.ipyb

## ddg.ipyb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "235a092e",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#importing required libraries\n",
    "from google_play_scraper import app, Sort, reviews_all\n",
    "from os.path import exists\n",
    "import json\n",
    "import re\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pprint import pprint\n",
    "import gensim\n",
    "import gensim.corpora as corpora\n",
    "from gensim.utils import simple_preprocess\n",
    "from nltk.corpus import stopwords\n",
    "from gensim.models import CoherenceModel\n",
    "import spacy\n",
    "import pyLDAvis\n",
    "import pyLDAvis.gensim_models\n",
    "import matplotlib.pyplot as plt\n",
    "import nltk\n",
    "from nltk import FreqDist\n",
    "from matplotlib import pyplot as plt\n",
    "from wordcloud import WordCloud, STOPWORDS\n",
    "import matplotlib.colors as mcolors\n",
    "import spacy\n",
    "\n",
    "nlp=spacy.load('en_core_web_sm',disable=['parser', 'ner'])\n",
    "\n",
    "#importing the Stopwords to use them, tidying a little and including some domain/superfluous stuff\n",
    "stop_words = stopwords.words('english')\n",
    "stop_words.extend(['duckduckgo', 'duck', 'go', 'duckduck', 'browser', 'ddg', 'app', 'good', 'great', \n",
    "                   'search', 'engine', 'use', 'nice', 'chrome', ''])\n",
    "\n",
    "#downloading the data, storing it in text file (also used by the R script)\n",
    "if not exists('reviews.txt'):\n",
    "    result = reviews_all('com.duckduckgo.mobile.android',\n",
    "            sleep_milliseconds=0,\n",
    "            lang='en', \n",
    "            country='us'\n",
    "           )\n",
    "    file = open('reviews.txt', 'w')\n",
    "    hopeful_json = json.dump(result, file, indent=4, sort_keys=True, default=str)\n",
    "    file.close()\n",
    "        \n",
    "with open(\"reviews.txt\", \"r\") as read_file:\n",
    "    review_json = json.load(read_file)\n",
    "    # Concatenate reviews, strip some punctuation \n",
    "    reviews_content = [ review['content'] for review in review_json if type(review['content']) == str ]    \n",
    "\n",
    "data = [r.lower() for r in reviews_content]\n",
    "print(\"Approximate number of reviews: \", len(data))\n",
    "    \n",
    "#cleaning the text \n",
    "def tokeniz(sentences):\n",
    "    for sentence in sentences:\n",
    "        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))\n",
    "processed_data = list(tokeniz(data))\n",
    "\n",
    "#Building bigram & trigram Models\n",
    "bigram = gensim.models.Phrases(processed_data, min_count=5, threshold=50)\n",
    "trigram = gensim.models.Phrases(bigram[processed_data], threshold=50)\n",
    "bigram_mod = gensim.models.phrases.Phraser(bigram)\n",
    "trigram_mod = gensim.models.phrases.Phraser(trigram)\n",
    "\n",
    "#function to filter out stopwords\n",
    "def remove_stopwords(texts):\n",
    "    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]\n",
    "\n",
    "#function to create bigrams\n",
    "def create_bigrams(texts):\n",
    "    return [bigram_mod[doc] for doc in texts]\n",
    "\n",
    "#function to create trigrams\n",
    "def create_trigrams(texts):\n",
    "    [trigram_mod[bigram_mod[doc]] for doc in texts]\n",
    "\n",
    "#function for lemmatization\n",
    "def lemmatize(texts, allowed_postags=['NOUN', 'ADJ', 'VERB']):\n",
    "    texts_op = []\n",
    "    for sent in texts:\n",
    "        doc = nlp(\" \".join(sent))\n",
    "        texts_op.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])\n",
    "    return texts_op\n",
    "\n",
    "#removing stopwords, creating bigrams and lemmatizing the text\n",
    "data_wo_stopwords = remove_stopwords(processed_data)\n",
    "data_bigrams = create_bigrams(data_wo_stopwords)\n",
    "data_lemmatized = lemmatize(data_bigrams, allowed_postags=[ 'NOUN', 'ADJ'])\n",
    "print(trigram_mod[bigram_mod[texts]])\n",
    "\n",
    "#printing the lemmatized data\n",
    "print(data_lemmatized[:3])\n",
    "\n",
    "#creating a dictionary\n",
    "gensim_dictionary = corpora.Dictionary(data_lemmatized)\n",
    "\n",
    "texts = data_lemmatized\n",
    "\n",
    "#building a corpus for the topic model\n",
    "gensim_corpus = [gensim_dictionary.doc2bow(text) for text in texts]\n",
    "\n",
    "#printing the corpus we created above.\n",
    "print(gensim_corpus[:3]) \n",
    "\n",
    "#we can print the words with their frequencies.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe541fbf",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "#creating the LDA model (100 passes produced the cleanest result but took forever)\n",
    "lda_model = gensim.models.ldamodel.LdaModel(\n",
    "   corpus=gensim_corpus, id2word=gensim_dictionary, num_topics=10, random_state=100, chunksize=100, passes=10)\n",
    "\n",
    "#calculating the coherence\n",
    "coherence_model_lda = CoherenceModel(\n",
    "   model=lda_model, texts=data_lemmatized, dictionary=gensim_dictionary, coherence='c_v')\n",
    "coherence_lda = coherence_model_lda.get_coherence()\n",
    "\n",
    "# Display Perplexity (low is good) and coherence (high is good)\n",
    "print('\\nPerplexity: ', lda_model.log_perplexity(gensim_corpus))\n",
    "print('\\nCoherence Score: ', coherence_lda)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b55aaf5f",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Show the notebook\n",
    "pyLDAvis.enable_notebook()\n",
    "visualization = pyLDAvis.gensim_models.prepare(lda_model, gensim_corpus, gensim_dictionary, mds='mmds')\n",
    "visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82b4ee84",
   "metadata": {},
   "outputs": [],
   "source": [
    "# For fun/easier understanding, print out some wordclouds\n",
    "cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'\n",
    "cloud = WordCloud(stopwords=stop_words,\n",
    "                  background_color='white',\n",
    "                  width=2500,\n",
    "                  height=1800,\n",
    "                  max_words=10,\n",
    "                  colormap='tab10',\n",
    "                  color_func=lambda *args, **kwargs: cols[i],\n",
    "                  prefer_horizontal=1.0)\n",
    "\n",
    "topics = lda_model.show_topics(formatted=False)\n",
    "print(len(topics))\n",
    "\n",
    "fig, axes = plt.subplots(5, 2, figsize=(10,10), sharex=True, sharey=True)\n",
    "\n",
    "for i, ax in enumerate(axes.flatten()):\n",
    "    fig.add_subplot(ax)\n",
    "    topic_words = dict(topics[i][1])\n",
    "    cloud.generate_from_frequencies(topic_words, max_font_size=300)\n",
    "    plt.gca().imshow(cloud)\n",
    "    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))\n",
    "    plt.gca().axis('off')\n",
    "\n",
    "\n",
    "plt.subplots_adjust(wspace=0, hspace=0)\n",
    "plt.axis('off')\n",
    "plt.margins(x=0, y=0)\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "235a092e",
	"metadata": {
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"#importing required libraries\n",
	"from google_play_scraper import app, Sort, reviews_all\n",
	"from os.path import exists\n",
	"import json\n",
	"import re\n",
	"import numpy as np\n",
	"import pandas as pd\n",
	"from pprint import pprint\n",
	"import gensim\n",
	"import gensim.corpora as corpora\n",
	"from gensim.utils import simple_preprocess\n",
	"from nltk.corpus import stopwords\n",
	"from gensim.models import CoherenceModel\n",
	"import spacy\n",
	"import pyLDAvis\n",
	"import pyLDAvis.gensim_models\n",
	"import matplotlib.pyplot as plt\n",
	"import nltk\n",
	"from nltk import FreqDist\n",
	"from matplotlib import pyplot as plt\n",
	"from wordcloud import WordCloud, STOPWORDS\n",
	"import matplotlib.colors as mcolors\n",
	"import spacy\n",
	"\n",
	"nlp=spacy.load('en_core_web_sm',disable=['parser', 'ner'])\n",
	"\n",
	"#importing the Stopwords to use them, tidying a little and including some domain/superfluous stuff\n",
	"stop_words = stopwords.words('english')\n",
	"stop_words.extend(['duckduckgo', 'duck', 'go', 'duckduck', 'browser', 'ddg', 'app', 'good', 'great', \n",
	" 'search', 'engine', 'use', 'nice', 'chrome', ''])\n",
	"\n",
	"#downloading the data, storing it in text file (also used by the R script)\n",
	"if not exists('reviews.txt'):\n",
	" result = reviews_all('com.duckduckgo.mobile.android',\n",
	" sleep_milliseconds=0,\n",
	" lang='en', \n",
	" country='us'\n",
	" )\n",
	" file = open('reviews.txt', 'w')\n",
	" hopeful_json = json.dump(result, file, indent=4, sort_keys=True, default=str)\n",
	" file.close()\n",
	" \n",
	"with open(\"reviews.txt\", \"r\") as read_file:\n",
	" review_json = json.load(read_file)\n",
	" # Concatenate reviews, strip some punctuation \n",
	" reviews_content = [ review['content'] for review in review_json if type(review['content']) == str ] \n",
	"\n",
	"data = [r.lower() for r in reviews_content]\n",
	"print(\"Approximate number of reviews: \", len(data))\n",
	" \n",
	"#cleaning the text \n",
	"def tokeniz(sentences):\n",
	" for sentence in sentences:\n",
	" yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))\n",
	"processed_data = list(tokeniz(data))\n",
	"\n",
	"#Building bigram & trigram Models\n",
	"bigram = gensim.models.Phrases(processed_data, min_count=5, threshold=50)\n",
	"trigram = gensim.models.Phrases(bigram[processed_data], threshold=50)\n",
	"bigram_mod = gensim.models.phrases.Phraser(bigram)\n",
	"trigram_mod = gensim.models.phrases.Phraser(trigram)\n",
	"\n",
	"#function to filter out stopwords\n",
	"def remove_stopwords(texts):\n",
	" return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]\n",
	"\n",
	"#function to create bigrams\n",
	"def create_bigrams(texts):\n",
	" return [bigram_mod[doc] for doc in texts]\n",
	"\n",
	"#function to create trigrams\n",
	"def create_trigrams(texts):\n",
	" [trigram_mod[bigram_mod[doc]] for doc in texts]\n",
	"\n",
	"#function for lemmatization\n",
	"def lemmatize(texts, allowed_postags=['NOUN', 'ADJ', 'VERB']):\n",
	" texts_op = []\n",
	" for sent in texts:\n",
	" doc = nlp(\" \".join(sent))\n",
	" texts_op.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])\n",
	" return texts_op\n",
	"\n",
	"#removing stopwords, creating bigrams and lemmatizing the text\n",
	"data_wo_stopwords = remove_stopwords(processed_data)\n",
	"data_bigrams = create_bigrams(data_wo_stopwords)\n",
	"data_lemmatized = lemmatize(data_bigrams, allowed_postags=[ 'NOUN', 'ADJ'])\n",
	"print(trigram_mod[bigram_mod[texts]])\n",
	"\n",
	"#printing the lemmatized data\n",
	"print(data_lemmatized[:3])\n",
	"\n",
	"#creating a dictionary\n",
	"gensim_dictionary = corpora.Dictionary(data_lemmatized)\n",
	"\n",
	"texts = data_lemmatized\n",
	"\n",
	"#building a corpus for the topic model\n",
	"gensim_corpus = [gensim_dictionary.doc2bow(text) for text in texts]\n",
	"\n",
	"#printing the corpus we created above.\n",
	"print(gensim_corpus[:3]) \n",
	"\n",
	"#we can print the words with their frequencies.\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "fe541fbf",
	"metadata": {},
	"outputs": [],
	"source": [
	"\n",
	"#creating the LDA model (100 passes produced the cleanest result but took forever)\n",
	"lda_model = gensim.models.ldamodel.LdaModel(\n",
	" corpus=gensim_corpus, id2word=gensim_dictionary, num_topics=10, random_state=100, chunksize=100, passes=10)\n",
	"\n",
	"#calculating the coherence\n",
	"coherence_model_lda = CoherenceModel(\n",
	" model=lda_model, texts=data_lemmatized, dictionary=gensim_dictionary, coherence='c_v')\n",
	"coherence_lda = coherence_model_lda.get_coherence()\n",
	"\n",
	"# Display Perplexity (low is good) and coherence (high is good)\n",
	"print('\\nPerplexity: ', lda_model.log_perplexity(gensim_corpus))\n",
	"print('\\nCoherence Score: ', coherence_lda)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "b55aaf5f",
	"metadata": {
	"scrolled": false
	},
	"outputs": [],
	"source": [
	"# Show the notebook\n",
	"pyLDAvis.enable_notebook()\n",
	"visualization = pyLDAvis.gensim_models.prepare(lda_model, gensim_corpus, gensim_dictionary, mds='mmds')\n",
	"visualization"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "82b4ee84",
	"metadata": {},
	"outputs": [],
	"source": [
	"# For fun/easier understanding, print out some wordclouds\n",
	"cols = [color for name, color in mcolors.TABLEAU_COLORS.items()] # more colors: 'mcolors.XKCD_COLORS'\n",
	"cloud = WordCloud(stopwords=stop_words,\n",
	" background_color='white',\n",
	" width=2500,\n",
	" height=1800,\n",
	" max_words=10,\n",
	" colormap='tab10',\n",
	" color_func=lambda args, *kwargs: cols[i],\n",
	" prefer_horizontal=1.0)\n",
	"\n",
	"topics = lda_model.show_topics(formatted=False)\n",
	"print(len(topics))\n",
	"\n",
	"fig, axes = plt.subplots(5, 2, figsize=(10,10), sharex=True, sharey=True)\n",
	"\n",
	"for i, ax in enumerate(axes.flatten()):\n",
	" fig.add_subplot(ax)\n",
	" topic_words = dict(topics[i][1])\n",
	" cloud.generate_from_frequencies(topic_words, max_font_size=300)\n",
	" plt.gca().imshow(cloud)\n",
	" plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))\n",
	" plt.gca().axis('off')\n",
	"\n",
	"\n",
	"plt.subplots_adjust(wspace=0, hspace=0)\n",
	"plt.axis('off')\n",
	"plt.margins(x=0, y=0)\n",
	"plt.tight_layout()\n",
	"plt.show()\n"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.9.9"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}