Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Libraries for scraping Medium blog pages and handling the scraped data.
from bs4 import BeautifulSoup  # HTML parsing
import requests                # HTTP requests
import pandas as pd
import numpy as np
from time import sleep         # polite delay between page requests
from random import randint     # randomize the delay length

# Range of pages: there are 792 pages in total to scrape from the URL.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# One-time setup (run in a Python console): fetch the NLTK stopword corpus.
# Kept as separate statements rather than a semicolon compound (PEP 8).
import nltk
nltk.download('stopwords')

import re
import numpy as np
import pandas as pd
from pprint import pprint
import json

# Gensim
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NLTK English stop words, extended with domain-specific terms that carry
# no topical signal in Medium blog descriptions.
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words.extend(['medium', 'publication', 'article', 'live', 'platform', 'story', 'write'])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NLTK English stop words plus domain-specific filler terms.
# NOTE(review): this snippet appears multiple times in the scraped source;
# only one copy is needed in a real script.
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words.extend(['medium', 'publication', 'article', 'live', 'platform', 'story', 'write'])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NLTK English stop words plus domain-specific filler terms.
# NOTE(review): duplicate of the earlier stop-word setup; keep a single copy.
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words.extend(['medium', 'publication', 'article', 'live', 'platform', 'story', 'write'])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read the input CSV and use the 'Description' column for analysis.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in
# 2.0 — the modern equivalent of error_bad_lines=False is on_bad_lines='skip'.
# Kept as-is here to avoid breaking an older pinned pandas; confirm the
# installed version before switching.
df = pd.read_csv(
    'blog_medium.csv',
    error_bad_lines=False,   # skip malformed rows instead of raising
    skipinitialspace=False,
    sep=';',
    index_col=0,
)
print(df.Description.unique())

# Working copy of the text column (the bare `df` display expression from the
# original notebook cell is a no-op in a script and was dropped).
df['data'] = df.Description.tolist()
data = df['data']
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pre-processing: strip noise characters from the text.
# Patterns are raw strings — '\s', '\?', '\_' in plain strings are invalid
# escape sequences (DeprecationWarning today, SyntaxError in future Pythons).
df['data'] = [re.sub(r'\s*@\s*\s?', ' ', str(sent)) for sent in df['data']]  # '@' plus surrounding whitespace
df['data'] = [re.sub(r'\?', ' ', str(sent)) for sent in df['data']]          # question marks
df['data'] = [re.sub(r'\_', ' ', str(sent)) for sent in df['data']]          # underscores
# NOTE(review): the pattern below matches the literal character sequence
# @"X" — it looks like a C# verbatim string (@"[\d-]") pasted into Python.
# Presumably the intent was to remove digits/dashes (r'[\d-]'); confirm the
# intent before changing, so behavior is preserved here.
df['data'] = [re.sub(r'@"[\d-]"', ' ', str(sent)) for sent in df['data']]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def sent_to_words(sentences):
    """Yield each sentence tokenized into a list of lowercase words.

    Uses gensim's simple_preprocess; deacc=True removes punctuation
    (and accents) during tokenization.  `yield` is a statement, not a
    function — the original's yield(...) parentheses were dropped.
    """
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)


data_words = list(sent_to_words(df['data']))
print(data_words[:1])

# Build the bigram model; a higher threshold yields fewer detected phrases.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
OlderNewer