Last active
April 1, 2016 01:15
-
-
Save monspo1/aca64ccc0dddcf622e44 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Collect, for each distinct company, the overall star rating and the first
# review block shown on its Indeed review page.
#
# Indeed_comp_complete_for_review.csv only contains companies that have a
# company page on indeed.com (companies without one were omitted); it is
# based on 'indeed_companies.csv'.
#
# Result: df_for_review, one row per distinct 'comp_name'.

# The rating is read from the inline "width: <N>px" style of the star span
# and scaled so that a width of 120px corresponds to 5.0.
_STAR_BAR_FULL_WIDTH_PX = 120.0
_MAX_RATING = 5.0
_STAR_WIDTH_RE = re.compile(r"width:\s*([0-9.]+)")

# Listed explicitly so the column layout does not depend on dict ordering.
_REVIEW_COLUMNS = [
    'comp_name', 'comp_rating_overall', 'comp_review_link',
    'review_cons', 'review_pros', 'review_summary', 'review_votes',
]


def _review_page_url(overall_link):
    """Return the '/reviews' URL derived from *overall_link*, or "" if missing.

    A missing link shows up as None or as a float (pandas reads an empty
    cell as NaN). Everything from the character preceding "from=" onwards
    is cut off before "/reviews" is appended; a link without "from=" is
    used unchanged as the base instead of raising AttributeError.
    """
    if overall_link is None or isinstance(overall_link, float):
        return ""
    marker = re.search("from=", overall_link)
    base = overall_link if marker is None else overall_link[:marker.start() - 1]
    return "%s%s" % (base, "/reviews")


def _overall_rating(page):
    """Return the rating parsed from the star span's width, or NaN if absent."""
    missing = float("nan")
    star_span = page.find("span", {"class": "cmp-star-large-on"})
    if star_span is None:
        return missing
    width = _STAR_WIDTH_RE.search(star_span.attrs.get("style", ""))
    if width is None:
        return missing
    try:
        width_px = float(width.group(1))
    except ValueError:  # e.g. a stray "." matched by the pattern
        return missing
    return round((width_px * _MAX_RATING) / _STAR_BAR_FULL_WIDTH_PX, 1)


def _node_text(parent, tag, css_class, default=""):
    """Return the stripped text of the first matching child, else *default*.

    *default* is also returned when *parent* is None or the text is empty.
    """
    node = None
    if parent is not None:
        node = parent.find(tag, attrs={'class': css_class})
    text = node.getText().strip() if node is not None else ""
    return text or default


df_received_for_review = pd.read_csv('./data/Indeed_comp_complete_for_review.csv')
df_for_review = pd.DataFrame()  # rebuilt from review_rows once the loop is done
comp_set = set()  # company names already recorded (duplicate check)
review_rows = []  # one dict per recorded company; cheaper than appending to a frame

for i, (target_comp_name, url_2nd) in enumerate(zip(
        df_received_for_review['comp_name'],
        df_received_for_review['overall_link'])):
    url_cmp_review = _review_page_url(url_2nd)
    if url_cmp_review == "":  # target url is not available
        # NOTE(review): the original indentation was lost; rows without a URL
        # are assumed to be skipped rather than recorded -- confirm.
        print("NANNNNNNN")
        continue

    # Checked before fetching so a duplicate costs no network request.
    if target_comp_name in comp_set:
        print("-Duplicates!!!! %s  at  %s" % (target_comp_name, i))
        continue

    response = urllib.urlopen(url_cmp_review)
    try:
        target_for_review = Soup(response, "lxml")
    finally:
        response.close()

    comp_rating_overall = _overall_rating(target_for_review)

    # Only the first 'cmp-review' block on the page is inspected; when the
    # page has none, the text fields fall back to their defaults.
    targetElements = target_for_review.find('div', attrs={'class': 'cmp-review'})
    review_summary = _node_text(targetElements, 'div', 'cmp-review-title')
    review_pros = _node_text(targetElements, 'div', 'cmp-review-pro-text')
    review_cons = _node_text(targetElements, 'div', 'cmp-review-con-text')
    review_vote_count = _node_text(targetElements, 'span', 'cmp-vote-count', "0")

    comp_set.add(target_comp_name)
    review_rows.append({
        'comp_name': target_comp_name, 'comp_review_link': url_cmp_review,
        'review_summary': review_summary, 'review_pros': review_pros,
        'review_cons': review_cons, 'review_votes': review_vote_count,
        'comp_rating_overall': comp_rating_overall,
    })
    print("%d | %s | %s" % (i, len(review_rows), target_comp_name))

df_for_review = pd.DataFrame(review_rows, columns=_REVIEW_COLUMNS)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment