Skip to content

Instantly share code, notes, and snippets.

@pakkinlau
Last active October 28, 2023 17:11
Show Gist options
  • Save pakkinlau/467d0e5a676a8759039d11a0e95f3327 to your computer and use it in GitHub Desktop.
Save pakkinlau/467d0e5a676a8759039d11a0e95f3327 to your computer and use it in GitHub Desktop.
A Python script that scrapes job information from JobsDB, a popular recruitment website in Hong Kong. It also counts the skill sets mentioned in each posting and produces simple statistics from the collected data.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"import requests\n",
"import re\n",
"import pandas as pd\n",
"import csv \n",
"from requests.exceptions import ReadTimeout\n",
"import time\n",
"\n",
"# Module-level lists for collecting crawl results. `global` is a no-op at\n",
"# module scope, so plain assignments are all that is needed here.\n",
"job_title_list = []\n",
"company_list = []\n",
"yr_exp_list = []\n",
"level_list = []\n",
"role_list = []\n",
"role = []\n",
"skill_list = []\n",
"\n",
"def retry_get(url, headers=None, retries=3, timeout=10):\n",
"    \"\"\"\n",
"    Send an HTTP GET request, retrying when an attempt fails.\n",
"\n",
"    Args:\n",
"        url (str): The URL to request.\n",
"        headers (dict): Optional HTTP headers.\n",
"        retries (int): Maximum number of attempts (default is 3).\n",
"        timeout (int): Timeout for each attempt in seconds (default is 10).\n",
"\n",
"    Returns:\n",
"        requests.Response: The response object if the request is successful.\n",
"\n",
"    Raises:\n",
"        Exception: If every attempt fails; the last request error is chained\n",
"            as the cause.\n",
"    \"\"\"\n",
"    last_error = None\n",
"    for attempt in range(1, retries + 1):\n",
"        try:\n",
"            response = requests.get(url, headers=headers, timeout=timeout)\n",
"            response.raise_for_status()  # Raise an exception for 4xx and 5xx status codes\n",
"            return response\n",
"        except requests.exceptions.RequestException as e:\n",
"            # ReadTimeout is a RequestException subclass, so this one clause\n",
"            # covers timeouts, connection errors and HTTP error statuses.\n",
"            last_error = e\n",
"            remaining = retries - attempt\n",
"            print(f'Request failed ({e}); {remaining} attempt(s) left')\n",
"            if remaining:\n",
"                time.sleep(2)  # short pause, but only when another attempt follows\n",
"\n",
"    raise Exception(\"Max retries reached. Unable to make the request.\") from last_error\n",
"\n",
"def _first_text(soup, class_names):\n",
"    \"\"\"Return the text of the first element whose class matches one of class_names.\"\"\"\n",
"    for class_name in class_names:\n",
"        tag = soup.find(class_=class_name)\n",
"        if tag is not None:\n",
"            return tag.text\n",
"    raise LookupError(f'no element found for classes {class_names}')\n",
"\n",
"\n",
"def _match_skills(skill_set, text):\n",
"    \"\"\"Return the entries of skill_set that appear in text as standalone terms.\"\"\"\n",
"    found = []\n",
"    for skill in skill_set:\n",
"        # Escape regex metacharacters ('d3.js', 'c#') and require non-alphanumeric\n",
"        # neighbours so short names such as 'r' do not match inside other words.\n",
"        skill_pattern = r'(?<![a-z0-9])' + re.escape(skill) + r'(?![a-z0-9])'\n",
"        if re.search(skill_pattern, text):\n",
"            found.append(skill)\n",
"    return found\n",
"\n",
"\n",
"def get_job_id(job_name, skills):\n",
"    \"\"\"\n",
"    Crawl the JobsDB search results for a job title and count mentioned skills.\n",
"\n",
"    Every result page is scanned for job-ad URLs, each ad is downloaded, and one\n",
"    row per ad (id, title, company, experience, level, matched skills) is\n",
"    appended to 'jobsDBtest_csv_result.csv'. The module-level result lists and\n",
"    the global job_id list are updated as a side effect.\n",
"\n",
"    Args:\n",
"        job_name (str): Job title to search for, e.g. 'machine learning engineer'.\n",
"        skills (list): Skill names to look for (matched case-insensitively).\n",
"\n",
"    Returns:\n",
"        pandas.Series: Number of ads mentioning each skill found during this\n",
"            call, sorted in descending order.\n",
"\n",
"    Raises:\n",
"        ValueError: If the total number of jobs cannot be read from the first\n",
"            result page.\n",
"    \"\"\"\n",
"    t1 = time.time()\n",
"    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1'}\n",
"    jobs_per_page = 30\n",
"\n",
"    # The search URL is a hyphen-separated slug followed by the page number.\n",
"    slug = re.sub(r'\\s+', '-', job_name.strip())\n",
"    baseurl = 'https://hk.jobsdb.com/hk/search-jobs/' + slug + '/'\n",
"    first_page = BeautifulSoup(retry_get(baseurl + '1', headers=headers).text, 'lxml')\n",
"\n",
"    # The first number in the og:description meta tag is taken as the job total.\n",
"    description = first_page.find('meta', property='og:description')\n",
"    numbers = re.findall(r'\\d+', description['content']) if description else []\n",
"    if not numbers:\n",
"        raise ValueError(f'Could not read the number of jobs from {baseurl}1')\n",
"    # Ceiling division: an exact multiple of the page size needs no extra page.\n",
"    num_page = max(1, -(-int(numbers[0]) // jobs_per_page))\n",
"\n",
"    global job_id\n",
"    job_id = []\n",
"\n",
"    # Normalise the skills once: trimmed, lower-cased, de-duplicated, order kept.\n",
"    skill_set = list(dict.fromkeys(s.strip().lower() for s in skills if s.strip()))\n",
"\n",
"    # Job-ad URLs are embedded in a script tag with '/' escaped as \\u002F.\n",
"    job_url_pattern = re.compile(r'(https\\W\\Wu002F\\Wu002Fhk\\Wjobsdb\\Wcom\\Wu002Fhk\\Wu002Fen\\Wu002Fjob\\Wu002F)([a-zA-Z0-9]+\\W)*')\n",
"    for page in range(1, num_page + 1):\n",
"        print(f'Now start to craw job search result P.{page}')\n",
"        if page == 1:\n",
"            page_soup = first_page  # already downloaded above\n",
"        else:\n",
"            page_soup = BeautifulSoup(retry_get(baseurl + str(page), headers=headers).text, 'lxml')\n",
"\n",
"        scripts = page_soup.find_all('script')\n",
"        if len(scripts) < 2:\n",
"            print(f'No job data found on result P.{page}')\n",
"            continue\n",
"        script_str = scripts[1].get_text()[28:]\n",
"\n",
"        # Collect the ids of every page, not only of the last one.\n",
"        for match in job_url_pattern.finditer(script_str):\n",
"            cleaned = match[0].replace('u002F', '').replace('\"', '')\n",
"            c = cleaned.split('\\\\')[-1]\n",
"            if not c:\n",
"                continue  # a URL without a job slug yields an empty id\n",
"            print(c)\n",
"            job_id.append(c)\n",
"\n",
"    matched_skills = []\n",
"    for current_id in job_id:\n",
"        try:\n",
"            print(f'Now start to craw individual job id {current_id}')\n",
"\n",
"            joburl = 'https://hk.jobsdb.com/hk/en/job/' + str(current_id)\n",
"            job_soup = BeautifulSoup(retry_get(joburl, headers=headers).text, 'lxml')\n",
"\n",
"            job_title = _first_text(job_soup, ('general-pos ad-y-auto-txt2', 'general-pos ad-y-auto-txt')).strip()\n",
"            print(f'Job title is {job_title}')\n",
"            company = _first_text(job_soup, ('jobad-header-company', 'jobad-header-company ad-y-auto-txt1')).lstrip()\n",
"            print(f'company {company}')\n",
"\n",
"            role = [span.text.lower() for span in job_soup.find_all('span')]\n",
"            print(f' Got job role for this post {job_title}')\n",
"\n",
"            exp_tag = job_soup.find('b', class_='primary-meta-exp')\n",
"            exp_text = exp_tag.text.strip() if exp_tag else ''\n",
"            # Keep the whole leading number so that '10 years' is not cut to '1'.\n",
"            leading_number = re.match(r'\\d+', exp_text)\n",
"            if leading_number:\n",
"                yr_exp = leading_number.group()\n",
"            else:\n",
"                yr_exp = exp_text[:1] or 'Didnt specify'\n",
"            print(f' Exp is {yr_exp} ')\n",
"\n",
"            level_tag = job_soup.find('b', class_='primary-meta-lv')\n",
"            level = level_tag.text.strip() if level_tag else 'Didnt specify'\n",
"            print(f' level is {level} ')\n",
"\n",
"            print('now to matching')\n",
"            required_skill = _match_skills(skill_set, '\\n'.join(role))\n",
"\n",
"            # newline='' stops the csv module from emitting blank rows on Windows.\n",
"            with open('jobsDBtest_csv_result.csv', 'a', newline='', encoding='utf-8') as f:\n",
"                csv_writer = csv.writer(f)\n",
"                csv_writer.writerow([current_id, job_title, company, yr_exp, level, required_skill])\n",
"        except Exception as e:\n",
"            # Best effort: report why this ad failed and move on to the next one.\n",
"            print(f'Cant crawd some id ({current_id}): {e}')\n",
"            continue\n",
"\n",
"        # Record the ad only after it was fully processed so the lists stay aligned.\n",
"        job_title_list.append(job_title)\n",
"        company_list.append(company)\n",
"        yr_exp_list.append(yr_exp)\n",
"        level_list.append(level)\n",
"        role_list.append(role)\n",
"        skill_list.extend(required_skill)\n",
"        matched_skills.extend(required_skill)\n",
"\n",
"    t2 = time.time()\n",
"    print(f'It takes {t2-t1} to crawd all the jobs info')\n",
"    print(f'There are total {len(job_id)} jobs in this category')\n",
"\n",
"    # Count only this call's matches; the global skill_list keeps growing across calls.\n",
"    frequency = {}\n",
"    for item in matched_skills:\n",
"        frequency[item] = frequency.get(item, 0) + 1\n",
"    skill_count = pd.Series(frequency, dtype='int64').sort_values(ascending=False)\n",
"\n",
"    return skill_count"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Now start to craw job search result P.1\n",
"Now start to craw job search result P.2\n",
"Now start to craw job search result P.3\n",
"Now start to craw job search result P.4\n",
"Now start to craw job search result P.5\n",
"Now start to craw job search result P.6\n",
"Now start to craw job search result P.7\n",
"Now start to craw job search result P.8\n",
"Now start to craw job search result P.9\n",
"\n",
"machine-learning-engineer-consultant-manager-level-110000000444950?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=241&jobId=jobsdb-hk-job-110000000444950\n",
"computer-vision-algorithm-engineer-110000000446429?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=242&jobId=jobsdb-hk-job-110000000446429\n",
"data-engineer-110000000447437?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=243&jobId=jobsdb-hk-job-110000000447437\n",
"solution-architect-ai-and-industrial-110000000445770?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=244&jobId=jobsdb-hk-job-110000000445770\n",
"solution-architect-ai-and-industrial-110000000445710?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=245&jobId=jobsdb-hk-job-110000000445710\n",
"ai-researcher-110000000444038?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=246&jobId=jobsdb-hk-job-110000000444038\n",
"postdoctoral-fellow-110000000447542?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=247&jobId=jobsdb-hk-job-110000000447542\n",
"service-system-digital-solutions-specialist-chatbot-development-110000000447117?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=248&jobId=jobsdb-hk-job-110000000447117\n",
"data-engineering-lead-analyst-110000000444507?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=249&jobId=jobsdb-hk-job-110000000444507\n",
"assistant-computing-officer-110000000443579?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=250&jobId=jobsdb-hk-job-110000000443579\n",
"senior-manager-artificial-intelligence-and-data-science-110000000443709?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=251&jobId=jobsdb-hk-job-110000000443709\n",
"r-d-data-scientist-full-time-110000000443895?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=252&jobId=jobsdb-hk-job-110000000443895\n",
"senior-manager-data-strategy-governance-110000000443203?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=253&jobId=jobsdb-hk-job-110000000443203\n",
"senior-data-engineer-110000000441950?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=254&jobId=jobsdb-hk-job-110000000441950\n",
"asia-pacific-innovation-and-generative-ai-solutions-director-risk-and-compliance-110000000446567?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=255&jobId=jobsdb-hk-job-110000000446567\n",
"research-assistant-110000000444099?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=256&jobId=jobsdb-hk-job-110000000444099\n",
"senior-execution-manager-innovation-and-generative-ai-solutions-global-functions-110000000447495?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=257&jobId=jobsdb-hk-job-110000000447495\n",
"software-engineer-iii-full-stack-110000000444431?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=258&jobId=jobsdb-hk-job-110000000444431\n",
"research-associate-research-assistant-junior-research-assistant-110000000443598?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=259&jobId=jobsdb-hk-job-110000000443598\n",
"web-developer-110000000446113?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=260&jobId=jobsdb-hk-job-110000000446113\n",
"postdoctoral-fellow-research-associate-research-assistant-110000000445684?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=261&jobId=jobsdb-hk-job-110000000445684\n",
"engineer-radio-network-analysis-110000000442909?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=262&jobId=jobsdb-hk-job-110000000442909\n",
"postdoctoral-fellow-s-research-associate-s-110000000445344?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=263&jobId=jobsdb-hk-job-110000000445344\n",
"credit-risk-manager-modelling-110000000443212?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=264&jobId=jobsdb-hk-job-110000000443212\n",
"Now start to craw individual job id \n",
"Timeout Error: Retrying... (3 retries left)\n",
"Timeout Error: Retrying... (2 retries left)\n",
"Timeout Error: Retrying... (1 retries left)\n",
"Cant crawd some id\n",
"Now start to craw individual job id machine-learning-engineer-consultant-manager-level-110000000444950?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=241&jobId=jobsdb-hk-job-110000000444950\n",
"Cant crawd some id\n",
"Now start to craw individual job id computer-vision-algorithm-engineer-110000000446429?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=242&jobId=jobsdb-hk-job-110000000446429\n",
"Cant crawd some id\n",
"Now start to craw individual job id data-engineer-110000000447437?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=243&jobId=jobsdb-hk-job-110000000447437\n",
"Cant crawd some id\n",
"Now start to craw individual job id solution-architect-ai-and-industrial-110000000445770?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=244&jobId=jobsdb-hk-job-110000000445770\n",
"Cant crawd some id\n",
"Now start to craw individual job id solution-architect-ai-and-industrial-110000000445710?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=245&jobId=jobsdb-hk-job-110000000445710\n",
"Cant crawd some id\n",
"Now start to craw individual job id ai-researcher-110000000444038?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=246&jobId=jobsdb-hk-job-110000000444038\n",
"Cant crawd some id\n",
"Now start to craw individual job id postdoctoral-fellow-110000000447542?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=247&jobId=jobsdb-hk-job-110000000447542\n",
"Cant crawd some id\n",
"Now start to craw individual job id service-system-digital-solutions-specialist-chatbot-development-110000000447117?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=248&jobId=jobsdb-hk-job-110000000447117\n",
"Cant crawd some id\n",
"Now start to craw individual job id data-engineering-lead-analyst-110000000444507?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=249&jobId=jobsdb-hk-job-110000000444507\n",
"Cant crawd some id\n",
"Now start to craw individual job id assistant-computing-officer-110000000443579?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=250&jobId=jobsdb-hk-job-110000000443579\n",
"Cant crawd some id\n",
"Now start to craw individual job id senior-manager-artificial-intelligence-and-data-science-110000000443709?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=251&jobId=jobsdb-hk-job-110000000443709\n",
"Cant crawd some id\n",
"Now start to craw individual job id r-d-data-scientist-full-time-110000000443895?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=252&jobId=jobsdb-hk-job-110000000443895\n",
"Cant crawd some id\n",
"Now start to craw individual job id senior-manager-data-strategy-governance-110000000443203?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=253&jobId=jobsdb-hk-job-110000000443203\n",
"Cant crawd some id\n",
"Now start to craw individual job id senior-data-engineer-110000000441950?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=254&jobId=jobsdb-hk-job-110000000441950\n",
"Cant crawd some id\n",
"Now start to craw individual job id asia-pacific-innovation-and-generative-ai-solutions-director-risk-and-compliance-110000000446567?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=255&jobId=jobsdb-hk-job-110000000446567\n",
"Cant crawd some id\n",
"Now start to craw individual job id research-assistant-110000000444099?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=256&jobId=jobsdb-hk-job-110000000444099\n",
"Cant crawd some id\n",
"Now start to craw individual job id senior-execution-manager-innovation-and-generative-ai-solutions-global-functions-110000000447495?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=257&jobId=jobsdb-hk-job-110000000447495\n",
"Cant crawd some id\n",
"Now start to craw individual job id software-engineer-iii-full-stack-110000000444431?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=258&jobId=jobsdb-hk-job-110000000444431\n",
"Cant crawd some id\n",
"Now start to craw individual job id research-associate-research-assistant-junior-research-assistant-110000000443598?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=259&jobId=jobsdb-hk-job-110000000443598\n",
"Cant crawd some id\n",
"Now start to craw individual job id web-developer-110000000446113?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=260&jobId=jobsdb-hk-job-110000000446113\n",
"Cant crawd some id\n",
"Now start to craw individual job id postdoctoral-fellow-research-associate-research-assistant-110000000445684?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=261&jobId=jobsdb-hk-job-110000000445684\n",
"Cant crawd some id\n",
"Now start to craw individual job id engineer-radio-network-analysis-110000000442909?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=262&jobId=jobsdb-hk-job-110000000442909\n",
"Cant crawd some id\n",
"Now start to craw individual job id postdoctoral-fellow-s-research-associate-s-110000000445344?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=263&jobId=jobsdb-hk-job-110000000445344\n",
"Cant crawd some id\n",
"Now start to craw individual job id credit-risk-manager-modelling-110000000443212?token=0~0159c085-bb00-48f1-83f8-f2726a9b9600&sectionRank=264&jobId=jobsdb-hk-job-110000000443212\n",
"Cant crawd some id\n",
"It takes 37.525963306427 to crawd all the jobs info\n",
"There are total 25 jobs in this category\n"
]
}
],
"source": [
"# Skill names to look for in each job ad. Every entry needs its own comma:\n",
"# adjacent string literals are silently concatenated ('Qlik' 'Python' -> 'QlikPython').\n",
"skills = [\n",
"    'R', 'RStudio', 'Markdown', 'Latex', 'SparkR', 'D3', 'D3.js', 'Microsoft Office', 'Excel',\n",
"    'Unix', 'Linux', 'MySQL', 'Microsoft SQL server', 'SQL', 'VBA', 'Qlik',\n",
"    'Python', 'SPSS', 'SAS', 'C#', 'Matlab', 'Java', 'keras',\n",
"    'JavaScript', 'HTML', 'HTML5', 'CSS', 'CSS3', 'PHP', 'Tableau',\n",
"    'AWS', 'Amazon Web Services', 'Google Cloud Platform', 'GCP', 'theano',\n",
"    'Microsoft Azure', 'Azure', 'Hadoop', 'Spark',\n",
"    'MapReduce', 'Map Reduce', 'Shark', 'Cassandra',\n",
"    'NoSQL', 'MongoDB', 'GIS', 'Haskell', 'Scala', 'Ruby', 'Perl',\n",
"    'Mahout', 'Stata', 'Deep Learning', 'Machine Learning', 'Pytorch', 'Tensorflow', 'Caffe', 'API', 'seo',\n",
"    'Business Intelligence', 'BI',\n",
"]\n",
"\n",
"A = get_job_id('machine learning engineer', skills)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment