Last active
June 19, 2020 07:57
-
-
Save nimish-gupta/6f66b354032f0ca693a4a61cc45eb9c6 to your computer and use it in GitHub Desktop.
Scrape jobs from hacker news
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// SITE - https://www.workatastartup.com
/**
 * Scrolls the page to the very bottom.
 *
 * Uses `document.scrollingElement` when available and falls back to
 * `document.body`. Does nothing when neither exists.
 */
const scroll = () => {
  const target = document.scrollingElement || document.body;
  if (!target) {
    return;
  }
  // Setting scrollTop to the full content height jumps to the bottom.
  target.scrollTop = target.scrollHeight;
};
/**
 * Returns a promise that resolves after a delay.
 *
 * @param {number} [ms=2000] - Delay in milliseconds; defaults to two seconds.
 * @returns {Promise<void>} Resolves (with no value) once the delay has elapsed.
 */
const sleep = (ms = 2 * 1000) =>
  new Promise((resolve) => setTimeout(resolve, ms));
/**
 * Collects the job openings listed under one company element.
 *
 * For every `.job-details` node, the first non-blank line of its `innerText`
 * becomes the opening's name and the remaining non-blank lines are joined
 * with `'- '` into the detail string.
 *
 * @param {Element} elem - Company container to search in.
 * @returns {{name: string, detail: string}[]} One entry per job found.
 */
const getOpenings = (elem) =>
  [...elem.querySelectorAll('.jobs-list>.job>.job-line>.job-details')].map(
    (jobNode) => {
      // Trim and drop blank lines so empty rows in innerText do not leak
      // into the name or produce stray delimiters in the detail.
      const lines = String(jobNode.innerText || '')
        .split(/\r?\n/)
        .map((line) => line.trim())
        .filter((line) => line !== '');
      const [name = '', ...detail] = lines;
      return { name, detail: detail.join('- ') };
    }
  );
/**
 * Replaces every line break in `text` with `delimiter`.
 *
 * @param {*} text - Value to flatten; null/undefined yield an empty string,
 *   other non-strings are converted with `String()`.
 * @param {string} [delimiter=' '] - Replacement for each line break.
 * @returns {string} The single-line text.
 */
const removeNewLine = (text, delimiter = ' ') =>
  text == null ? '' : String(text).split(/\r?\n/).join(delimiter);
/**
 * Reads a property from the first descendant matching `selector` and returns
 * it as single-line text.
 *
 * @param {Element} node - Element to search within.
 * @param {string} selector - CSS selector for the wanted descendant.
 * @param {string} [prop='innerText'] - Property to read from the match.
 * @returns {string} The property value with line breaks removed, or `''`
 *   when nothing matches or the property is null/undefined.
 */
const getEmptyOrValue = (node, selector, prop = 'innerText') => {
  const elem = node.querySelector(selector);
  if (!elem) {
    return '';
  }
  const value = elem[prop];
  // A missing property must not reach removeNewLine, which expects text.
  if (value == null) {
    return '';
  }
  return removeNewLine(value);
};
/**
 * Builds the absolute job-description URL for a company slug.
 *
 * @param {string|null} slug - Path taken from a link's `href` attribute,
 *   e.g. `/companies/acme`.
 * @returns {string} The URL resolved against https://www.workatastartup.com,
 *   or `''` when `slug` is empty or null.
 */
const getJdLink = (slug) => {
  if (!slug) {
    return '';
  }
  // Resolving via URL copes with slugs that lack a leading slash and with
  // hrefs that are already absolute.
  return new URL(slug, 'https://www.workatastartup.com').href;
};
/**
 * Extracts one row per job opening from a company element.
 *
 * Fields that cannot be found in the markup (batch, size, JD link) are set to
 * `''` instead of throwing, so one incomplete card does not lose the company.
 *
 * @param {Element} elem - A company container element.
 * @returns {Object[]} One record per opening with the keys "YC Batch",
 *   "Company Name", "Opening", "Job Detail", "Desc", "Link", "JD Link", "Size".
 */
const getCompanyInfo = (elem) => {
  // Returns the requested capture group of the first match, or '' if none.
  const firstMatch = (text, pattern, group = 0) => {
    const found = text.match(pattern);
    return found ? found[group] : '';
  };

  const metaCompanyTitle = getEmptyOrValue(elem, '.company-title');
  // The batch is the first parenthesised chunk of the title, e.g. "(S20)".
  const companyBatch = firstMatch(metaCompanyTitle, /\(([^)]+)\)/, 1);
  const companyName = (
    companyBatch
      ? metaCompanyTitle.replace(`(${companyBatch})`, '')
      : metaCompanyTitle
  ).trim();

  const openings = getOpenings(elem);
  const desc = getEmptyOrValue(elem, '.company-tech-desc');
  // NOTE(review): this records the link's visible text, not its href —
  // confirm that is the intended value for "Link".
  const link = getEmptyOrValue(elem, '.company-detail a');

  const titleAnchor = elem.querySelector('.company-title a');
  const jdLink = titleAnchor ? getJdLink(titleAnchor.getAttribute('href')) : '';

  // Size is the first run of digits found in the company detail text.
  const size = firstMatch(getEmptyOrValue(elem, '.company-detail'), /\d+/);

  return openings.map((opening) => ({
    'YC Batch': companyBatch,
    'Company Name': companyName,
    'Opening': opening.name,
    'Job Detail': opening.detail,
    'Desc': desc,
    'Link': link,
    'JD Link': jdLink,
    'Size': size,
  }));
};
// Column position of each record key in the formatted output (lower = further
// left). Every key produced by getCompanyInfo needs an entry here; a missing
// one makes a numeric sort comparator over these values yield NaN.
const companiesKeysOrder = {
  'YC Batch': 2,
  'Company Name': 1,
  'Opening': 7,
  'Job Detail': 8,
  'Desc': 3,
  'Link': 4,
  'JD Link': 5,
  'Size': 6,
};
/**
 * Serialises company records as tab-separated text with a header row.
 *
 * Columns come from the keys of the first record, ordered by
 * `companiesKeysOrder`; keys without an entry there are placed last in their
 * original relative order.
 *
 * @param {Object[]} companies - Records as produced by getCompanyInfo.
 * @returns {string|Object[]} The TSV text, or the input array itself when it
 *   is empty.
 */
const formatCompanies = (companies) => {
  if (companies.length === 0) {
    return companies;
  }
  const delimiter = '\t';

  // Unknown keys get the largest rank so the comparator never yields NaN.
  const rankOf = (key) => {
    const rank = companiesKeysOrder[key];
    return typeof rank === 'number' ? rank : Number.MAX_SAFE_INTEGER;
  };
  // Tabs and line breaks inside a value would split the cell or the row.
  const toCell = (value) =>
    value == null ? '' : String(value).replace(/[\t\r\n]+/g, ' ');

  const headerKeys = Object.keys(companies[0]).sort(
    (a, b) => rankOf(a) - rankOf(b)
  );
  const header = headerKeys.join(delimiter);
  const strCompanies = companies
    .map((company) =>
      headerKeys.map((key) => toCell(company[key])).join(delimiter)
    )
    .join('\n');
  return `${header}\n${strCompanies}`;
};
/**
 * Scrolls the directory page until all advertised companies are loaded, then
 * extracts one record per job opening.
 *
 * The expected total is read from the first number in `.directory-meta`.
 * Scrolling stops early when several consecutive scrolls load nothing new,
 * so a total the page never reaches cannot loop forever. Pass the result to
 * `formatCompanies` for tab-separated output.
 *
 * @param {Object} [options]
 * @param {number} [options.maxStalledScrolls=10] - Consecutive scrolls
 *   without new `.list-wrapper` elements after which loading is abandoned.
 * @returns {Promise<Object[]>} Records from every company that parsed.
 * @throws {Error} When `.directory-meta` is missing or contains no number.
 */
const scrape = async ({ maxStalledScrolls = 10 } = {}) => {
  const countLoaded = () => document.querySelectorAll('.list-wrapper').length;

  const meta = document.querySelector('.directory-meta');
  // Accept thousands separators such as "1,234" and strip them before parsing.
  const totalMatch = meta ? String(meta.innerText).match(/\d[\d,]*/) : null;
  if (!totalMatch) {
    throw new Error(
      'Could not read the total company count from .directory-meta'
    );
  }
  const total = Number.parseInt(totalMatch[0].replace(/,/g, ''), 10);

  let foundElements = countLoaded();
  let stalledScrolls = 0;
  while (foundElements < total && stalledScrolls < maxStalledScrolls) {
    console.log({ foundElements, total });
    scroll();
    await sleep();
    const loadedNow = countLoaded();
    // Any progress resets the stall counter.
    stalledScrolls = loadedNow > foundElements ? 0 : stalledScrolls + 1;
    foundElements = loadedNow;
  }
  if (foundElements < total) {
    console.warn(
      `Stopped scrolling with ${foundElements} of ${total} companies loaded`
    );
  }

  const companies = [];
  document.querySelectorAll('.list-wrapper').forEach((elem) => {
    try {
      companies.push(...getCompanyInfo(elem));
    } catch (error) {
      // Skip a company that fails to parse, but log it for inspection.
      console.log(elem);
      console.error(error);
    }
  });

  console.log(companies);
  return companies;
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment