Skip to content

Instantly share code, notes, and snippets.

View shubham-singh-ss's full-sized avatar

Shubham Singh shubham-singh-ss

View GitHub Profile
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Scrape the id, title and description of every video URL in `links`.
# NOTE: `driver` and `links` are not defined in this snippet; they must already
# exist (a Selenium WebDriver and an iterable of watch-page URLs).
# Shared explicit wait: each element lookup below is given up to 10 seconds.
wait = WebDriverWait(driver, 10)
v_category = "CATEGORY_NAME"
WATCH_URL_PREFIX = 'https://www.youtube.com/watch?v='
for x in links:
    driver.get(x)
    # str.strip() treats its argument as a *set of characters* and trims them
    # from both ends, which eats leading/trailing id characters such as
    # 'w', 'a', 't', 'c', 'h', 'v', 'o', 'u', 'm', 'b', 'e'. Remove the
    # literal prefix instead; URLs without the prefix are left unchanged.
    v_id = x[len(WATCH_URL_PREFIX):] if x.startswith(WATCH_URL_PREFIX) else x
    v_title = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "h1.title yt-formatted-string"))).text
    # The selector must be a single string literal; a descendant combinator
    # (one space) joins the two parts, mirroring the title selector above.
    v_description = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "div#description yt-formatted-string"))).text
    # NOTE(review): v_id / v_title / v_description are rebound on every
    # iteration and nothing visible here stores them; presumably the rows are
    # collected by code not shown in this snippet.
# Stack the per-category frames into one frame with a fresh 0..n-1 index.
# NOTE: the df_* frames are not defined in this snippet; they must already exist.
frames = [df_travel, df_science, df_food, df_manufacturing, df_history, df_artndance]
# `join_axes` was removed from pd.concat in pandas 1.0 and passing it (even as
# None) raises TypeError. The remaining keywords that were spelled out before
# were all at their default values, so only the meaningful ones are kept.
df_copy = pd.concat(frames, axis=0, join='outer', ignore_index=True)
# Split df_copy into four single-column frames that share df_copy's index.
# The original used typographic quotes (‘ ’) around the column keys, which is
# a SyntaxError in Python; plain ASCII quotes are required.
# Selecting with a list of one label yields a DataFrame (not a Series), and
# .copy() keeps each frame independent of df_copy.
df_link = df_copy[['link']].copy()
df_title = df_copy[['title']].copy()
df_description = df_copy[['description']].copy()
df_category = df_copy[['category']].copy()
# Normalise the title and description text into stemmed, stop-word-free
# strings and wrap each resulting list in a single-column DataFrame.
# NOTE: this block needs `re`, `PorterStemmer` and `stopwords` in scope; none
# of them is imported in the visible part of the file.
def _build_corpus(texts, stemmer, stop_words):
    """Return one cleaned string per entry of *texts*.

    Each entry has every non-letter character replaced by a space, is
    lower-cased and split on whitespace; words found in *stop_words* are
    dropped, the rest are stemmed with *stemmer* and re-joined with single
    spaces.
    """
    non_letters = re.compile('[^a-zA-Z]')
    cleaned = []
    for text in texts:
        words = non_letters.sub(' ', text).lower().split()
        cleaned.append(
            ' '.join(stemmer.stem(word) for word in words if word not in stop_words)
        )
    return cleaned


# Build the stemmer and the stop-word set once instead of once per row / per
# word as the original loops did.
ps = PorterStemmer()
english_stop_words = set(stopwords.words('english'))

# Iterate over the columns themselves rather than a hard-coded range(0, 8375),
# so the code works for any number of rows.
corpus = _build_corpus(df_title['title'], ps, english_stop_words)
corpus1 = _build_corpus(df_description['description'], ps, english_stop_words)

dftitle = pd.DataFrame({'title': corpus})
dfdescription = pd.DataFrame({'description': corpus1})
# Encode the category labels as integers and assemble the final frame.
from sklearn.preprocessing import LabelEncoder
# DataFrame.apply runs a fresh fit_transform on each column; df_category has a
# single 'category' column, so this yields one integer-coded column.
dfcategory = df_category.apply(LabelEncoder().fit_transform)
# `join_axes` was removed from pd.concat in pandas 1.0. The documented
# replacement is to reindex the concatenated result, which likewise restricts
# the rows to df_link's index.
df_new = pd.concat(
    [df_link, dftitle, dfdescription, dfcategory], axis=1
).reindex(df_link.index)