Created
August 28, 2013 06:48
-
-
Save onurdegerli/6362839 to your computer and use it in GitHub Desktop.
This code gets the job titles from http://jobsearch.about.com.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import MySQLdb | |
import urllib2 | |
from urllib2 import Request, urlopen, URLError, HTTPError | |
import re | |
regex = re.compile("<li>([\w &;/<>]*?)</li>") | |
db = MySQLdb.connect(host="", | |
user="", | |
passwd="", | |
db="") | |
cur = db.cursor() | |
cur.execute("SELECT * FROM job_title WHERE parent_job_title_id=0") | |
rows = cur.fetchall() | |
for row in rows: | |
parentJobTitleId = row[0] | |
parentJobTitleEn = row[2] | |
url = row[4] | |
print parentJobTitleId , '-' , parentJobTitleEn , '-', url | |
req = urllib2.Request(url) | |
try: | |
response = urlopen(req) | |
except HTTPError as e: | |
print 'The server couldn\'t fulfill the request.' | |
print 'Error code: ', e.code | |
except URLError as e: | |
print 'We failed to reach a server.' | |
print 'Reason: ', e.reason | |
else: | |
html = response.read() | |
# print html | |
r = regex.findall(html) | |
for jobTitleEn in r: | |
print jobTitleEn | |
cur.execute('''INSERT INTO job_title (parent_job_title_id, title_en, title_tr, url) | |
values (%s, %s, %s, %s)''', | |
(parentJobTitleId, jobTitleEn, '', '')) | |
# print cur._last_executed | |
db.commit() | |
db.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Job titles stored as a two-level hierarchy in a single table.
CREATE TABLE IF NOT EXISTS `job_title` (
  `job_title_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  -- 0 marks a top-level (parent) title; child rows store the
  -- job_title_id of the parent they were scraped from.
  `parent_job_title_id` int(10) unsigned NOT NULL DEFAULT '0',
  -- English title.
  `title_en` varchar(255) NOT NULL,
  -- NOTE(review): presumably the Turkish translation; always inserted as '' here -- confirm.
  `title_tr` varchar(255) NOT NULL,
  -- For parent rows: the page the scraper downloads to find child titles.
  -- Child rows are inserted with an empty url.
  `url` varchar(255) NOT NULL,
  PRIMARY KEY (`job_title_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 AUTO_INCREMENT=3788 ;
--
-- Dumping data for table `job_title`
--
-- Seed rows: the 32 top-level categories (parent_job_title_id = 0), each
-- with the URL of the page that lists its job titles.
INSERT INTO `job_title` (`job_title_id`, `parent_job_title_id`, `title_en`, `title_tr`, `url`) VALUES
(1, 0, 'Advertising', '', 'http://jobsearch.about.com/od/job-titles/a/advertising-job-titles.htm'),
(2, 0, 'Accounting', '', 'http://jobsearch.about.com/od/job-title-samples/a/accounting-job-titles.htm'),
(3, 0, 'Construction', '', 'http://jobsearch.about.com/od/job-title-samples/a/construction-job-titles.htm'),
(4, 0, 'Administrative', '', 'http://jobsearch.about.com/od/job-title-samples/a/admin-job-titles.htm'),
(5, 0, 'Business', '', 'http://jobsearch.about.com/od/job-title-samples/a/business-job-titles.htm'),
(6, 0, 'Engineering', '', 'http://jobsearch.about.com/od/job-title-samples/a/engineering-job-titles.htm'),
(7, 0, 'Corporate', '', 'http://jobsearch.about.com/od/job-title-samples/a/c-level-job-titles.htm'),
(8, 0, 'Entry Level', '', 'http://jobsearch.about.com/od/best-jobs/a/best-entry-level-jobs.htm'),
(9, 0, 'Hospitality', '', 'http://jobsearch.about.com/od/job-title-samples/a/hospitality-job-titles.htm'),
(10, 0, 'Health/Safety', '', 'http://jobsearch.about.com/od/job-title-samples/a/health-safety-job-titles.htm'),
(11, 0, 'First', '', 'http://jobsearch.about.com/od/justforstudents/a/first-job-list.htm'),
(12, 0, 'Social Media', '', 'http://jobsearch.about.com/od/job-title-samples/a/social-media-job-titles.htm'),
(13, 0, 'Real Estate', '', 'http://jobsearch.about.com/od/job-title-samples/a/real-estate-job-titles.htm'),
(14, 0, 'Health Care / Medical', '', 'http://jobsearch.about.com/od/job-title-samples/a/health-care-job-titles.htm'),
(15, 0, 'Insurance', '', 'http://jobsearch.about.com/od/job-title-samples/a/insurance-job-titles.htm'),
(16, 0, 'IT', '', 'http://jobsearch.about.com/od/job-title-samples/a/it-job-titles.htm'),
(17, 0, 'Legal', '', 'http://jobsearch.about.com/od/job-title-samples/a/legal-job-titles.htm'),
(18, 0, 'Maintenance', '', 'http://jobsearch.about.com/od/job-title-samples/a/maintenance-job-titles.htm'),
(19, 0, 'Chef', '', 'http://culinaryarts.about.com/od/culinaryfundamentals/a/whatisachef.htm'),
(20, 0, 'Manufacturing', '', 'http://jobsearch.about.com/od/job-title-samples/a/manufacturing-job-titles.htm'),
(21, 0, 'Event Planning', '', 'http://eventplanning.about.com/od/eventcareers/tp/corporateevents.htm'),
(22, 0, 'Media', '', 'http://jobsearch.about.com/od/job-title-samples/a/media-job-titles.htm'),
(23, 0, 'Non Profit', '', 'http://jobsearch.about.com/od/job-title-samples/a/nonprofit-job-titles.htm'),
(24, 0, 'Finance', '', 'http://financecareers.about.com/od/jobtitles/a/jobtitles.htm'),
(25, 0, 'Public Relations', '', 'http://jobsearch.about.com/od/job-title-samples/a/public-relations-job-titles.htm'),
(26, 0, 'Science', '', 'http://jobsearch.about.com/od/job-title-samples/a/science-job-titles.htm'),
(27, 0, 'Human Resources', '', 'http://humanresources.about.com/od/jobdescriptions/f/hr_job_mgr.htm'),
(28, 0, 'Geography', '', 'http://geography.about.com/od/careersingeography/a/jobsgeography.htm'),
(29, 0, 'Second', '', 'http://jobsearch.about.com/od/parttimejobs/a/best-second-jobs.htm'),
(30, 0, 'Social Work', '', 'http://jobsearch.about.com/od/job-title-samples/a/social-work-job-titles.htm'),
(31, 0, 'Transportation', '', 'http://jobsearch.about.com/od/job-title-samples/a/transportation-job-titles.htm'),
(32, 0, 'Psychology Related', '', 'http://psychology.about.com/od/careersinpsychology/a/career-list.htm');
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
1- Create the MySQL table and insert the parent job titles.
2- Run "get_job_titles.py" on the command line.
Note: Do not forget to configure your MySQL connection settings.