Skip to content

Instantly share code, notes, and snippets.

@monspo1
Last active June 6, 2017 06:22
Show Gist options
  • Save monspo1/9bea6699fa86f773bb19 to your computer and use it in GitHub Desktop.
Save monspo1/9bea6699fa86f773bb19 to your computer and use it in GitHub Desktop.
# Scrape Indeed search-result pages and add one record per job posting to the
# existing DataFrame `df`.
#
# Names used here but defined elsewhere in the file: `base_url`, `sort_by`,
# `start_from` (pieces of the search URL), `Soup` (HTML parser class),
# `urllib`, and `df` (the DataFrame being extended).
import pandas as pd  # for pd.concat below; `df` already depends on pandas

home_url = "http://www.indeed.com"  # loop-invariant, so defined once up front
results_per_page = 10  # the offset in the URL advances by 10 for each page


def _text_of(node, strip=False):
    """Return the text of a parsed element, or None when the element is missing."""
    if node is None:
        return None
    text = node.getText()
    return text.strip() if strip else text


def _absolute_link(anchor):
    """Return the anchor's href prefixed with `home_url`, or None if unavailable."""
    href = anchor.get('href') if anchor is not None else None
    if href is None:
        return None
    return "%s%s" % (home_url, href)


# Rows are gathered in a plain list and added to `df` in one step after the
# loop: growing a DataFrame row by row copies the whole frame on every call,
# and DataFrame.append no longer exists in pandas >= 2.0.
rows = []

for page in range(1, 101):  # page from 1 to 100 (last page we can scrape is 100)
    offset = (page - 1) * results_per_page
    url = "%s%s%s%d" % (base_url, sort_by, start_from, offset)  # get full url

    # NOTE(review): urllib.urlopen is the Python 2 spelling; on Python 3 the
    # equivalent call is urllib.request.urlopen.
    response = urllib.urlopen(url)
    try:
        target = Soup(response, "lxml")
    finally:
        response.close()  # release the connection even if parsing fails

    # We're interested in each row (= each job). The class string is kept
    # verbatim, including its leading space.
    # NOTE(review): confirm it still matches the live markup; when nothing
    # matches, no rows are collected and `df` is left unchanged.
    job_rows = target.findAll('div', attrs={'class': ' row result'})

    # Get each specific job's information (company name, job title, urls, ...).
    # A missing element yields None for that field instead of aborting the
    # whole scrape with an AttributeError/KeyError.
    for elem in job_rows:
        name_span = elem.find('span', attrs={'itemprop': 'name'})
        title_anchor = elem.find('a', attrs={'class': 'turnstileLink'})
        # The company link, when present, is the anchor nested in the name span.
        company_anchor = name_span.find('a') if name_span is not None else None

        rows.append({
            'comp_name': _text_of(name_span, strip=True),
            'job_title': title_anchor.get('title') if title_anchor is not None else None,
            'job_link': _absolute_link(elem.find('a')),
            'job_posted': _text_of(elem.find('span', attrs={'class': 'date'})),
            'overall_link': _absolute_link(company_anchor),
            'job_location': _text_of(elem.find('span', attrs={'itemprop': 'addressLocality'})),
            # Rating columns are only created here; this block never fills them.
            'overall_rating': None, 'wl_bal_rating': None,
            'benefit_rating': None, 'jsecurity_rating': None,
            'mgmt_rating': None, 'culture_rating': None,
        })

# Add all collected jobs to our data frame in a single operation.
if rows:
    df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
df
@ddhar3
Copy link

ddhar3 commented Jun 6, 2017

I used this code to scrape Indeed, but "print df" shows that the dataframe is empty.
Any help with this? I really hoped this code would work.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment