@vrootic
Created July 29, 2017 00:21
from math import ceil

import requests
from bs4 import BeautifulSoup


def get_soup(url, headers, payload):
    """Fetch a page and parse it into a BeautifulSoup tree."""
    r = requests.get(url, params=payload, headers=headers)
    return BeautifulSoup(r.text, 'html5lib')
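
# A defensive variant (a sketch, not in the original gist): requests exposes
# Response.raise_for_status(), so a failed fetch can surface as an exception
# instead of silently producing an empty soup. The timeout value here is an
# illustrative assumption, not something the original code sets.
def get_soup_checked(url, headers, payload):
    r = requests.get(url, params=payload, headers=headers, timeout=30)
    r.raise_for_status()  # raises requests.HTTPError on 4xx/5xx responses
    return BeautifulSoup(r.text, 'html5lib')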

def get_total_pages(url, headers, payload):
    """Derive the number of result pages from the job count on the first page."""
    soup = get_soup(url, headers, payload)
    jobstr = soup.find('div', 'leftColumn').find('table', {'class': 'searchResultPages'}).find('td', {'class': 'td-result'}).text
    # The last token of the result-count cell is the total number of jobs;
    # with 10 jobs per page, e.g. 137 jobs means ceil(137 / 10) == 14 pages.
    jobnumbers = int(jobstr.rstrip().split(' ')[-1])
    return ceil(jobnumbers / 10)

def list_jobs(soup):
    """Print the link text, URL, and span text of every job post on one page."""
    posts = soup.find('div', 'leftColumn').findAll('div', {'class': 's-res'})
    career_url = 'http://salesforce.careermount.com/career/'
    for p in posts:
        print(p.find('a').text)                   # job title (the link text)
        print(career_url + p.find('a')['href'])   # absolute URL of the post
        print(p.find('span').text)                # details shown in the span
        print()

def url_cat(total_pages, url):
    """Yield the base results URL, then the URL of each numbered page."""
    yield url
    page = 1
    while page <= total_pages:
        yield url + str(page)
        page += 1
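
# A possible hardening (a sketch, not in the original gist): the hand-copied
# Cookie header in the __main__ block below expires, so a requests.Session
# could pick up fresh cookies from the server instead. make_session is a
# hypothetical helper; it assumes the site sets its session cookie
# (e.g. JSESSIONID) in response to a plain GET.
def make_session():
    session = requests.Session()
    session.headers['User-Agent'] = (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    )
    # The first request lets the server populate session.cookies, which
    # later session.get() calls send back automatically.
    session.get('http://salesforce.careermount.com/candidate/job_search/quick/results/')
    return session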

if __name__ == '__main__':
    url = 'http://salesforce.careermount.com/candidate/job_search/quick/results/'
    payload = {
        'location': 'California',
        'keyword': 'software',
        'sort_dir': 'desc',
        'sort_field': 'post_date',
        'relevance': 'false'
    }
    # These headers expire after a while; the Cookie value in particular
    # has to be refreshed from a live browser session.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Cookie': '__utmt=1; __utma=10312119.1119980742.1501264404.1501264404.1501264404.1; __utmb=10312119.1.10.1501264404; __utmc=10312119; __utmz=10312119.1501264404.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __atuvc=1%7C30; __atuvs=597b7a13cbbf5ab1000; JSESSIONID=D236CEC4A43668618F47703D2899E77F.node01; logged=""'
    }
    total_pages = get_total_pages(url, headers, payload)
    # url_cat yields the base URL first, so every page (including the first)
    # is fetched exactly once.
    url_gen = url_cat(total_pages, url)
    for url in url_gen:
        soup = get_soup(url, headers, payload)
        list_jobs(soup)