This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import requests | |
from lxml import html | |
import csv | |
import requests | |
from time import sleep | |
import re | |
import argparse | |
import sys | |
import pandas as pd |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def res_scraper(url): | |
driver = webdriver.Firefox(options=fireFoxOptions) | |
driver.get(url) | |
t.sleep(1) | |
page = driver.page_source | |
soup = BeautifulSoup(page, 'lxml') | |
soup2 = BeautifulSoup(page, 'html.parser') | |
final_data = [] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#using random delay for time.sleep() | |
delays = [7, 4, 6, 2, 10, 19] | |
delay = np.random.choice(delays) | |
def get_review(url, res_name, res_address): | |
binary = FirefoxBinary('/usr/bin/firefox') | |
opts = webdriver.FirefoxOptions() | |
opts.add_argument("--headless") | |
driver = webdriver.Firefox(firefox_binary=binary, firefox_options=opts ) | |
driver.get(url) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium import webdriver | |
from selenium.webdriver.common.proxy import Proxy, ProxyType | |
boston_res_info = pd.read_csv("combined_reviews_info13001.csv", encoding='utf-8-sig') | |
rev_urls = boston_res_info['reviewer_profile'].tolist() | |
delays = [1,2,3] | |
delay = np.random.choice(delays) | |
#keep changing proxy list based on https://sslproxies.org/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#program to combine txt.tsv and sub.tsv | |
import pandas as pd | |
# read tab-separated (TSV) data | |
df1 = pd.read_csv('/combinedSub_Tsv.tsv', sep = '\t') | |
df2 = pd.read_csv('/combinedTxt_Tsv.tsv', sep = '\t') | |
df1['aciks'] = df1['aciks'].astype(str) | |
df2_new = df2[['adsh', 'tag', 'series','class','value']].copy() | |
mask_df = df2_new['tag'].values == 'StrategyNarrativeTextBlock' | |
# new dataframe |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# read tab-separated (TSV) data | |
df1 = pd.read_csv('/Data_2010_Tsv_all.tsv', sep = '\t') | |
df1.reset_index(inplace=True) | |
df1 = df1.rename(columns = {'index':'id'}) | |
value_length_list =[] | |
for index, row in df1.iterrows(): | |
value_length_list.append(len(str(row['value']).strip())) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#program to calculate 9 indices | |
import spacy | |
import textstat | |
from textstat.textstat import textstatistics | |
#from textstat import legacy_round,neasy_word_set | |
import pandas as pd | |
import pysentiment2 as ps | |
import math | |
import re |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy | |
import textstat | |
from textstat.textstat import textstatistics | |
import pandas as pd | |
import pysentiment2 as ps | |
import math | |
import re | |
from collections import Counter | |
import numpy as np |