Skip to content

Instantly share code, notes, and snippets.

@monspo1
Last active April 1, 2016 01:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save monspo1/aca64ccc0dddcf622e44 to your computer and use it in GitHub Desktop.
Save monspo1/aca64ccc0dddcf622e44 to your computer and use it in GitHub Desktop.
# Indeed_comp_complete_for_review.csv only contains companies that have a
# company page on indeed.com; companies without one were omitted. It is based
# on 'indeed_companies.csv'.
#
# For every company listed in that file this script fetches the company's
# "/reviews" page and records the overall star rating together with the first
# review block found on the page (title, pros, cons, vote count). The result
# is stored in `df_for_review`, one row per distinct company name.

# Extracts the pixel width from an inline style such as "width: 86.4px;".
_STAR_WIDTH_RE = re.compile(r"width:\s*([0-9]+(?:\.[0-9]+)?)\s*px")
# The rating formula scales the width by 5/120, i.e. it treats a 120px wide
# star bar as a full five-star rating.
_FULL_STARS_WIDTH_PX = 120.0


def _build_review_url(overall_link):
    """Return the reviews-page URL derived from *overall_link*.

    Returns "" when no link is available. Otherwise everything from the
    separator character preceding ``from=`` onwards is dropped and
    "/reviews" is appended. A link without ``from=`` is used unchanged
    (the original code crashed on such a link).
    """
    # A missing CSV cell is read by pandas as a float NaN, hence the float test.
    if overall_link is None or isinstance(overall_link, float):
        return ""
    marker = overall_link.find("from=")
    if marker > 0:
        # marker - 1 also removes the "?" / "&" in front of "from=".
        base = overall_link[:marker - 1]
    else:
        base = overall_link
    return "%s%s" % (base, "/reviews")


def _node_text(node):
    """Return the stripped text of a parsed tag, or "" when the tag is missing."""
    if node is None:
        return ""
    return node.getText().strip()


def _parse_overall_rating(page):
    """Return the overall rating rounded to one decimal, or None if absent.

    The rating is derived from the ``width`` of the "cmp-star-large-on" span.
    """
    star = page.find("span", {"class": "cmp-star-large-on"})
    if star is None:
        return None
    match = _STAR_WIDTH_RE.search(star.attrs.get("style", ""))
    if match is None:
        return None
    return round((float(match.group(1)) * 5.0) / _FULL_STARS_WIDTH_PX, 1)


def _scrape_review_page(url):
    """Fetch *url* and return the scraped review fields as a dict.

    Missing page elements yield "" for the text fields, "0" for the vote
    count and None for the rating instead of raising AttributeError.
    Network errors from ``urllib.urlopen`` are not caught.
    """
    response = urllib.urlopen(url)
    try:
        page = Soup(response, "lxml")
    finally:
        # Release the connection even if parsing fails.
        response.close()

    # Only the first "cmp-review" block on the page is inspected.
    review = page.find('div', attrs={'class': 'cmp-review'})

    def field(tag, css_class):
        if review is None:
            return ""
        return _node_text(review.find(tag, attrs={'class': css_class}))

    return {
        'review_summary': field('div', 'cmp-review-title'),
        'review_pros': field('div', 'cmp-review-pro-text'),
        'review_cons': field('div', 'cmp-review-con-text'),
        # An empty or missing vote counter is recorded as "0".
        'review_votes': field('span', 'cmp-vote-count') or "0",
        'comp_rating_overall': _parse_overall_rating(page),
    }


df_received_for_review = pd.read_csv('./data/Indeed_comp_complete_for_review.csv')
comp_set = set()  # company names already recorded (duplicate check)
review_rows = []  # one dict per recorded company; turned into a frame below

for i, (target_comp_name, url_2nd) in enumerate(zip(
        df_received_for_review['comp_name'],
        df_received_for_review['overall_link'])):
    url_cmp_review = _build_review_url(url_2nd)
    if url_cmp_review == "":  # target url is not available: nothing to scrape
        print("NANNNNNNN")
        continue

    # Check for duplicates before fetching so no request is wasted on them.
    if target_comp_name in comp_set:
        print("-Duplicates!!!! %s  at  %s" % (target_comp_name, i))
        continue

    row = {'comp_name': target_comp_name, 'comp_review_link': url_cmp_review}
    row.update(_scrape_review_page(url_cmp_review))

    comp_set.add(target_comp_name)
    review_rows.append(row)
    print("%d | %s | %s" % (i, len(review_rows), target_comp_name))

# Build the frame once; appending to a DataFrame inside the loop copies the
# whole frame on every iteration.
df_for_review = pd.DataFrame(review_rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment