Skip to content

Instantly share code, notes, and snippets.

View Monicamundada's full-sized avatar
☺️

Monicamundada Monicamundada

☺️
  • CBS
  • Denmark
View GitHub Profile
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
#import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from time import sleep
from random import randint
#Range of pages #There are total 792 pages to scrape from the URL.
# Run in python console
import nltk; nltk.download('stopwords')
import re
import numpy as np
import pandas as pd
from pprint import pprint
import json
# Gensim
# NLTK stop words: standard English list plus domain-specific terms that are
# too frequent in this Medium-blog corpus to be useful for topic modeling.
# (The original pasted this 4-line setup three times, which appended the
# domain terms to the list three times; deduplicated to a single occurrence.)
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words.extend(['medium', 'publication', 'article', 'live', 'platform', 'story', 'write'])
# Read the scraped blog CSV (semicolon-separated, first column as index) and
# keep the 'Description' column for analysis.
# NOTE: `error_bad_lines=False` was deprecated in pandas 1.3 and removed in
# 2.0; `on_bad_lines='skip'` is the equivalent replacement (silently drop
# malformed rows instead of raising).
df = pd.read_csv('blog_medium.csv', on_bad_lines='skip',
                 skipinitialspace=False, sep=';', index_col=0)
print(df.Description.unique())
df  # no-op in a script; displays the frame when run in a notebook cell

# Work on a copy of the descriptions in a dedicated 'data' column.
df['data'] = df.Description.tolist()
data = df['data']
# Pre-processing: blank out '@' tokens (and surrounding whitespace), question
# marks, and underscores in each description. Raw strings are used so the
# regex escapes (\s, \?, \d) don't trigger Python's
# invalid-escape-sequence warnings; the patterns themselves are unchanged.
df['data'] = [re.sub(r'\s*@\s*\s?', ' ', str(sent)) for sent in df['data']]
df['data'] = [re.sub(r'\?', ' ', str(sent)) for sent in df['data']]
df['data'] = [re.sub(r'\_', ' ', str(sent)) for sent in df['data']]
# NOTE(review): this pattern matches the literal text '@"X"' where X is a
# digit or '-' — it looks like a C# verbatim-string snippet pasted into a
# Python regex and almost certainly never matches. Kept as-is to preserve
# behavior; confirm what was intended (plain digits/dashes?).
df['data'] = [re.sub(r'@"[\d-]"', ' ', str(sent)) for sent in df['data']]
def sent_to_words(sentences):
    """Lazily tokenize each sentence into a list of lowercase words.

    Uses gensim's simple_preprocess; deacc=True also strips accents and
    removes punctuation. (Body indentation restored — as pasted, the
    for/yield lines sat at column 0, which is a syntax error.)
    """
    for sentence in sentences:
        # deacc=True removes punctuations
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)
# Tokenize every description up front (materialized so it can be iterated
# more than once when training the phrase models below).
data_words = list(sent_to_words(df['data']))
print(data_words[:1])  # sanity check: first tokenized document
# Build the bigram and trigram models
# NOTE(review): `gensim` is used here but no `import gensim` appears in this
# chunk (only the "# Gensim" comment above) — confirm it is imported elsewhere.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.