Skip to content

Instantly share code, notes, and snippets.

@codesaler
Forked from 0xh3x/crawl.py
Created October 14, 2012 14:07
Show Gist options
  • Save codesaler/3888682 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import re
import urllib2
from BeautifulSoup import BeautifulSoup
class Crawler(object):
    """Collect the numeric ids of job postings linked from the jobs.ge listing.

    Attributes are documented inline below; ``job_list`` is the only piece of
    per-instance state and is filled by :meth:`fetch_job_list`.
    """

    # Listing page that is downloaded and scanned for job links.
    JOBS_URL = 'http://jobs.ge/?view=jobs'

    # Accept only hrefs that are exactly "/<digits>/". The pattern is anchored
    # because it is also used as the link filter: an unanchored pattern lets
    # hrefs such as "/en/123/" through, which then cannot be parsed as an int.
    JOB_HREF_RE = re.compile(r'^/(\d+)/$')

    def __init__(self):
        # Job ids (ints) gathered so far, in page order.
        self.job_list = []

    @classmethod
    def _job_id_from_href(cls, href):
        """Return the integer job id encoded in an href of the form "/<id>/".

        Raises:
            ValueError: if *href* does not have that exact form.
        """
        match = cls.JOB_HREF_RE.match(href)
        if match is None:
            raise ValueError("not a job link: %r" % (href,))
        return int(match.group(1))

    def fetch_job_list(self):
        """Download the listing page and append every job id found on it.

        Ids are appended to ``self.job_list``; calling this method again
        appends again (existing entries are not cleared). Progress is
        reported on stdout.
        """
        # Single-argument print(...) emits the same text under the Python 2
        # print statement and the Python 3 print function.
        print("Fetching job list")
        url_handle = urllib2.urlopen(self.JOBS_URL)
        try:
            soup = BeautifulSoup(url_handle)
        finally:
            # Release the connection even if parsing fails.
            url_handle.close()
        links = soup.findAll('a', href=self.JOB_HREF_RE, attrs={"class": "ls"})
        for link in links:
            # Item access is the documented way to read a tag attribute.
            self.job_list.append(self._job_id_from_href(link['href']))
        print("Done fetched: %d" % len(self.job_list))
def main():
    """Script entry point: create a Crawler and fetch the job list once."""
    Crawler().fetch_job_list()
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment