Skip to content

Instantly share code, notes, and snippets.

View shreya-singh-tech's full-sized avatar

Shreya shreya-singh-tech

View GitHub Profile
@shreya-singh-tech
shreya-singh-tech / yelp_step1.py
Created August 7, 2021 15:13
Script to extract and organize yelp business URLs.
from bs4 import BeautifulSoup
import requests
from lxml import html
import csv
import requests
from time import sleep
import re
import argparse
import sys
import pandas as pd
@shreya-singh-tech
shreya-singh-tech / yelp_restaurant.py
Created August 7, 2021 15:33
Python Program to get Yelp Restaurants' basic info
def res_scraper(url):
    """Scrape one Yelp restaurant page and parse its HTML.

    NOTE(review): this gist preview is truncated — the function body
    continues beyond what is shown, and `webdriver`, `fireFoxOptions`,
    `t` (presumably the time module) and `BeautifulSoup` must be
    imported/defined elsewhere in the full file; confirm there.
    """
    # Launch Firefox with options defined elsewhere in the file
    # (presumably headless — TODO confirm against the full gist).
    driver = webdriver.Firefox(options=fireFoxOptions)
    driver.get(url)
    # Short pause so the page can render before the source is captured.
    t.sleep(1)
    page = driver.page_source
    # The same HTML is parsed with two different parsers; presumably the
    # later scraping code relies on their differing tree-building behavior.
    soup = BeautifulSoup(page, 'lxml')
    soup2 = BeautifulSoup(page, 'html.parser')
    # Accumulator for scraped records (rest of the function not visible).
    final_data = []
@shreya-singh-tech
shreya-singh-tech / yelp_reviews.py
Created August 7, 2021 16:22
Python Program to scrape Yelp Reviews.
# using a random delay for time.sleep() — randomized pauses between
# requests make the scraper less likely to be rate-limited/blocked
delays = [7, 4, 6, 2, 10, 19]
# `np` (numpy) must be imported elsewhere in the full file.
delay = np.random.choice(delays)
def get_review(url, res_name, res_address):
    """Open a restaurant's Yelp page in headless Firefox to scrape reviews.

    NOTE(review): gist preview is truncated — the function body continues
    beyond what is shown; `FirefoxBinary` and `webdriver` are imported
    elsewhere in the full file. `res_name`/`res_address` are presumably
    attached to each scraped review record — confirm in the full gist.
    """
    # Point Selenium at the system Firefox binary explicitly.
    binary = FirefoxBinary('/usr/bin/firefox')
    opts = webdriver.FirefoxOptions()
    opts.add_argument("--headless")
    # NOTE(review): `firefox_options=` is the legacy keyword (newer
    # Selenium uses `options=`) — works only on older Selenium versions.
    driver = webdriver.Firefox(firefox_binary=binary, firefox_options=opts )
    driver.get(url)
@shreya-singh-tech
shreya-singh-tech / yelp_reviewer.py
Created August 7, 2021 16:34
Python Program to extract Yelp reviewer profile
from selenium import webdriver
from selenium.webdriver.common.proxy import Proxy, ProxyType
# Previously-combined review metadata; `pd` (pandas) must be imported
# elsewhere in the full file. utf-8-sig handles a possible BOM.
boston_res_info = pd.read_csv("combined_reviews_info13001.csv", encoding='utf-8-sig')
# One profile URL per reviewer to visit.
rev_urls = boston_res_info['reviewer_profile'].tolist()
# Short random inter-request delay (`np` = numpy, imported elsewhere).
delays = [1,2,3]
delay = np.random.choice(delays)
# keep changing proxy list based on https://sslproxies.org/
@shreya-singh-tech
shreya-singh-tech / Join_txt_sub.py
Created August 14, 2021 21:52
Program to join CSV files
# program to combine txt.tsv and sub.tsv
import pandas as pd
# read csv data
df1 = pd.read_csv('/combinedSub_Tsv.tsv', sep = '\t')
df2 = pd.read_csv('/combinedTxt_Tsv.tsv', sep = '\t')
# Normalize the key column to string (presumably so a later merge on
# 'aciks' does not fail on mismatched dtypes — TODO confirm).
df1['aciks'] = df1['aciks'].astype(str)
# Keep only the columns needed downstream; .copy() avoids
# SettingWithCopy warnings on later assignments.
df2_new = df2[['adsh', 'tag', 'series','class','value']].copy()
# Boolean mask selecting the strategy-narrative rows.
mask_df = df2_new['tag'].values == 'StrategyNarrativeTextBlock'
# new dataframe
# read csv data
# NOTE(review): df1 is re-read here, discarding the frame loaded above —
# the gist preview appears to splice two separate sections of the
# script; confirm against the full file.
df1 = pd.read_csv('/Data_2010_Tsv_all.tsv', sep = '\t')
# Promote the positional index to an explicit 'id' column.
df1.reset_index(inplace=True)
df1 = df1.rename(columns = {'index':'id'})
# Length of each stripped 'value' cell, in row order.
value_length_list =[]
for index, row in df1.iterrows():
    value_length_list.append(len(str(row['value']).strip()))
@shreya-singh-tech
shreya-singh-tech / textStat.py
Last active August 15, 2021 03:58
Calculate sentiment and readability
#program to calculate 9 indices
import spacy
import textstat
from textstat.textstat import textstatistics
#from textstat import legacy_round,neasy_word_set
import pandas as pd
import pysentiment2 as ps
import math
import re
@shreya-singh-tech
shreya-singh-tech / cos_sim.py
Last active August 15, 2021 03:57
Program to find cosine similarity
import spacy
import textstat
from textstat.textstat import textstatistics
import pandas as pd
import pysentiment2 as ps
import math
import re
from collections import Counter
import numpy as np