Skip to content

Instantly share code, notes, and snippets.

View Monicamundada's full-sized avatar
☺️

Monicamundada Monicamundada

☺️
  • CBS
  • Denmark
View GitHub Profile
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
#import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from time import sleep
from random import randint
#Range of pages #There are total 792 pages to scrape from the URL.
# Run in python console
import nltk; nltk.download('stopwords')
import re
import numpy as np
import pandas as pd
from pprint import pprint
import json
# Gensim
# NLTK stop words: standard English list plus domain-specific terms that are
# too frequent in this Medium-blog corpus to be useful for topic modeling.
# (The original pasted this 4-line setup three times, which appended the
# domain terms to the list three times; deduplicated to a single occurrence.)
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words.extend(['medium', 'publication', 'article', 'live', 'platform', 'story', 'write'])
# Read the scraped blog CSV (semicolon-separated, first column as index) and
# keep the 'Description' column for analysis.
# NOTE: `error_bad_lines=False` was deprecated in pandas 1.3 and removed in
# 2.0; `on_bad_lines='skip'` is the equivalent replacement (silently drop
# malformed rows instead of raising).
df = pd.read_csv('blog_medium.csv', on_bad_lines='skip',
                 skipinitialspace=False, sep=';', index_col=0)
print(df.Description.unique())
df  # no-op in a script; displays the frame when run in a notebook cell

# Work on a copy of the descriptions in a dedicated 'data' column.
df['data'] = df.Description.tolist()
data = df['data']
# Pre-processing: blank out '@' tokens (and surrounding whitespace), question
# marks, and underscores in each description. Raw strings are used so the
# regex escapes (\s, \?, \d) don't trigger Python's
# invalid-escape-sequence warnings; the patterns themselves are unchanged.
df['data'] = [re.sub(r'\s*@\s*\s?', ' ', str(sent)) for sent in df['data']]
df['data'] = [re.sub(r'\?', ' ', str(sent)) for sent in df['data']]
df['data'] = [re.sub(r'\_', ' ', str(sent)) for sent in df['data']]
# NOTE(review): this pattern matches the literal text '@"X"' where X is a
# digit or '-' — it looks like a C# verbatim-string snippet pasted into a
# Python regex and almost certainly never matches. Kept as-is to preserve
# behavior; confirm what was intended (plain digits/dashes?).
df['data'] = [re.sub(r'@"[\d-]"', ' ', str(sent)) for sent in df['data']]
def sent_to_words(sentences):
    """Lazily tokenize each sentence into a list of lowercase words.

    Uses gensim's simple_preprocess; deacc=True also strips accents and
    removes punctuation. (Body indentation restored — as pasted, the
    for/yield lines sat at column 0, which is a syntax error.)
    """
    for sentence in sentences:
        # deacc=True removes punctuations
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)
# Tokenize every description up front (materialized so it can be iterated
# more than once when training the phrase models below).
data_words = list(sent_to_words(df['data']))
print(data_words[:1])  # sanity check: first tokenized document
# Build the bigram and trigram models
# NOTE(review): `gensim` is used here but no `import gensim` appears in this
# chunk (only the "# Gensim" comment above) — confirm it is imported elsewhere.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.