# monspo1/proj3_webscrape_freq_words.py

## proj3_webscrape_freq_words.py
# Collect, for every company in the input CSV, the overall star rating and the
# first review block (title, pros, cons, vote count) found on its Indeed
# review page. The result is one row per unique company in `df_for_review`.
#
# Indeed_comp_complete_for_review.csv only contains companies that have a
# company page on indeed.com (companies without one were omitted; the file is
# based on 'indeed_companies.csv').
df_received_for_review = pd.read_csv('./data/Indeed_comp_complete_for_review.csv')

review_rows = []  # one dict per unique company; converted to a data frame once, after the loop
comp_set = set()  # company names already recorded, for the duplicate check

# (output column, tag, CSS class) of each text field read from the review block.
review_fields = (
    ('review_summary', 'div', 'cmp-review-title'),
    ('review_pros', 'div', 'cmp-review-pro-text'),
    ('review_cons', 'div', 'cmp-review-con-text'),
    ('review_votes', 'span', 'cmp-vote-count'),
)

for i, (target_comp_name, url_2nd) in enumerate(zip(
        df_received_for_review['comp_name'],
        df_received_for_review['overall_link'])):

    # Cut the link just before "from=" (dropping the one separator character
    # in front of it). A link without "from=" is used unchanged instead of
    # crashing on a failed search.
    # NOTE(review): a float here is presumably the NaN pandas stores for an
    # empty CSV cell -- confirm against the input file.
    url_base = ""
    if url_2nd is not None and not isinstance(url_2nd, float):
        from_pos = url_2nd.find("from=")
        url_base = url_2nd if from_pos < 0 else url_2nd[:max(from_pos - 1, 0)]
    if not url_base:  # target url is not available
        print("NANNNNNNN")
        continue

    # Check for duplicates before fetching, so a repeated company does not
    # cost another HTTP request.
    if target_comp_name in comp_set:
        print("-Duplicates!!!! %s  at  %s" % (target_comp_name, i))
        continue

    url_cmp_review = url_base + "/reviews"
    response = urllib.urlopen(url_cmp_review)
    try:
        target_for_review = Soup(response, "lxml")
    finally:
        response.close()  # release the connection even if parsing fails

    # The overall rating is read from the star bar's inline style
    # ("width: <N>px;") and scaled so that a 120px bar equals 5.0 stars.
    # It stays NaN when the bar or its width is missing.
    comp_rating_overall = float('nan')
    star_bar = target_for_review.find("span", {"class": "cmp-star-large-on"})
    if star_bar is not None:
        bar_width = re.search(r'width:\s*([0-9]+(?:\.[0-9]+)?)',
                              star_bar.attrs.get('style', ''))
        if bar_width is not None:
            comp_rating_overall = round((float(bar_width.group(1)) * 5.0) / 120, 1)

    # First review block on the page; None when the page lists no review.
    targetElements = target_for_review.find('div', attrs={'class': 'cmp-review'})

    review_row = {'comp_name': target_comp_name, 'comp_review_link': url_cmp_review}
    for column, tag, css_class in review_fields:
        node = None
        if targetElements is not None:
            node = targetElements.find(tag, attrs={'class': css_class})
        # A missing element yields an empty string for every field.
        review_row[column] = node.getText().strip() if node is not None else ""
    if review_row['review_votes'] == "":
        review_row['review_votes'] = "0"  # no vote count shown counts as zero votes
    review_row['comp_rating_overall'] = comp_rating_overall

    comp_set.add(target_comp_name)
    review_rows.append(review_row)
    print("%d | %s | %s" % (i, len(review_rows), target_comp_name))

# Build the frame once; appending to a DataFrame inside the loop copies the
# whole frame on every iteration.
df_for_review = pd.DataFrame(review_rows)
	# Indeed_comp_complete_for_review.csv only contains companies that have a
	# company page on indeed.com (companies without one were omitted; the file is
	# based on 'indeed_companies.csv').

# NOTE(review): this section repeats the scrape performed earlier in this file
# and rebuilds `df_for_review` from scratch -- consider keeping only one copy.
#
# For every company in the input CSV, collect the overall star rating and the
# first review block (title, pros, cons, vote count) found on its Indeed
# review page, one row per unique company.
df_received_for_review = pd.read_csv('./data/Indeed_comp_complete_for_review.csv')

review_rows = []  # one dict per unique company; converted to a data frame once, after the loop
comp_set = set()  # company names already recorded, for the duplicate check

# (output column, tag, CSS class) of each text field read from the review block.
review_fields = (
    ('review_summary', 'div', 'cmp-review-title'),
    ('review_pros', 'div', 'cmp-review-pro-text'),
    ('review_cons', 'div', 'cmp-review-con-text'),
    ('review_votes', 'span', 'cmp-vote-count'),
)

for i, (target_comp_name, url_2nd) in enumerate(zip(
        df_received_for_review['comp_name'],
        df_received_for_review['overall_link'])):

    # Cut the link just before "from=" (dropping the one separator character
    # in front of it). A link without "from=" is used unchanged instead of
    # crashing on a failed search.
    # NOTE(review): a float here is presumably the NaN pandas stores for an
    # empty CSV cell -- confirm against the input file.
    url_base = ""
    if url_2nd is not None and not isinstance(url_2nd, float):
        from_pos = url_2nd.find("from=")
        url_base = url_2nd if from_pos < 0 else url_2nd[:max(from_pos - 1, 0)]
    if not url_base:  # target url is not available
        print("NANNNNNNN")
        continue

    # Check for duplicates before fetching, so a repeated company does not
    # cost another HTTP request.
    if target_comp_name in comp_set:
        print("-Duplicates!!!! %s  at  %s" % (target_comp_name, i))
        continue

    url_cmp_review = url_base + "/reviews"
    response = urllib.urlopen(url_cmp_review)
    try:
        target_for_review = Soup(response, "lxml")
    finally:
        response.close()  # release the connection even if parsing fails

    # The overall rating is read from the star bar's inline style
    # ("width: <N>px;") and scaled so that a 120px bar equals 5.0 stars.
    # It stays NaN when the bar or its width is missing.
    comp_rating_overall = float('nan')
    star_bar = target_for_review.find("span", {"class": "cmp-star-large-on"})
    if star_bar is not None:
        bar_width = re.search(r'width:\s*([0-9]+(?:\.[0-9]+)?)',
                              star_bar.attrs.get('style', ''))
        if bar_width is not None:
            comp_rating_overall = round((float(bar_width.group(1)) * 5.0) / 120, 1)

    # First review block on the page; None when the page lists no review.
    targetElements = target_for_review.find('div', attrs={'class': 'cmp-review'})

    review_row = {'comp_name': target_comp_name, 'comp_review_link': url_cmp_review}
    for column, tag, css_class in review_fields:
        node = None
        if targetElements is not None:
            node = targetElements.find(tag, attrs={'class': css_class})
        # A missing element yields an empty string for every field.
        review_row[column] = node.getText().strip() if node is not None else ""
    if review_row['review_votes'] == "":
        review_row['review_votes'] = "0"  # no vote count shown counts as zero votes
    review_row['comp_rating_overall'] = comp_rating_overall

    comp_set.add(target_comp_name)
    review_rows.append(review_row)
    # Plain '|' separators: the backslashes in the pasted text were escape
    # residue and would have been printed literally.
    print("%d | %s | %s" % (i, len(review_rows), target_comp_name))

# Build the frame once; appending to a DataFrame inside the loop copies the
# whole frame on every iteration.
df_for_review = pd.DataFrame(review_rows)