Skip to content

Instantly share code, notes, and snippets.

@nimish-gupta
Last active June 19, 2020 07:57
Show Gist options
  • Save nimish-gupta/6f66b354032f0ca693a4a61cc45eb9c6 to your computer and use it in GitHub Desktop.
Save nimish-gupta/6f66b354032f0ca693a4a61cc45eb9c6 to your computer and use it in GitHub Desktop.
Scrape jobs from hacker news
// SITE - https://www.workatastartup.com
/**
 * Jumps the page to its bottom by setting the scroll offset of the
 * document's scrolling element (or of `document.body` when that is
 * unavailable) to its full scroll height.
 */
const scroll = () => {
  let target = document.scrollingElement;
  if (!target) {
    target = document.body;
  }
  target.scrollTop = target.scrollHeight;
};
/**
 * Returns a promise that resolves after a delay.
 *
 * @param {number} [ms=2000] - Delay in milliseconds; defaults to two
 *   seconds, so a bare `sleep()` call waits two seconds.
 * @returns {Promise<void>} Promise resolved once the timer fires.
 */
const sleep = (ms = 2 * 1000) =>
  new Promise((resolve) => setTimeout(resolve, ms));
/**
 * Collects the job openings found under one element.
 *
 * Each `.job-details` node's text is split on newlines: the first line
 * becomes the opening's name and the remaining lines are joined with
 * '- ' to form its detail string.
 *
 * @param {Element} elem - Element to search for job-detail nodes.
 * @returns {{name: string, detail: string}[]} One entry per opening.
 */
const getOpenings = (elem) => {
  const detailNodes = elem.querySelectorAll(
    '.jobs-list>.job>.job-line>.job-details'
  );
  return Array.from(detailNodes, (node) => {
    const lines = node.innerText.split('\n');
    const name = lines[0];
    const detail = lines.slice(1).join('- ');
    return { name, detail };
  });
};
/**
 * Replaces every newline in `text` with `delimiter`.
 *
 * @param {string} text - Text that may span several lines.
 * @param {string} [delimiter=' '] - String put in place of each newline.
 * @returns {string} The text collapsed onto a single line.
 */
const removeNewLine = (text, delimiter = ' ') => {
  const lines = text.split('\n');
  return lines.join(delimiter);
};
/**
 * Reads a property from the first descendant of `node` matching
 * `selector`, with newlines stripped via `removeNewLine`.
 *
 * @param {Element} node - Element to search within.
 * @param {string} selector - CSS selector of the wanted descendant.
 * @param {string} [prop='innerText'] - Property to read from the match.
 * @returns {string} The single-line property value, or '' when nothing
 *   matches the selector.
 */
const getEmptyOrValue = (node, selector, prop = 'innerText') => {
  const match = node.querySelector(selector);
  return match ? removeNewLine(match[prop]) : '';
};
/**
 * Builds an absolute workatastartup.com URL from a site-relative path.
 *
 * @param {string} slug - Path to append to the site origin.
 * @returns {string} The origin followed by `slug`.
 */
const getJdLink = (slug) => {
  const origin = 'https://www.workatastartup.com';
  return `${origin}${slug}`;
};
/**
 * Builds one flat record per job opening for a single company element.
 *
 * Every record repeats the company-level fields (batch, name,
 * description, link, JD link, size) next to the opening's own name and
 * detail. Missing pieces of the card (no parenthesised batch, no title
 * link, no digits in the detail text) yield '' for that field instead of
 * throwing, so one incomplete card does not lose all of its openings.
 *
 * @param {Element} elem - Company element (`scrape` passes each
 *   `.list-wrapper` node).
 * @returns {Object[]} One record per opening; empty when the company has
 *   no openings.
 */
const getCompanyInfo = (elem) => {
  const metaCompanyTitle = getEmptyOrValue(elem, '.company-title');

  // The batch is whatever sits inside the first pair of parentheses in
  // the title text; the name is the title with that group removed.
  const batchMatch = metaCompanyTitle.match(/\(([^)]+)\)/);
  const companyBatch = batchMatch ? batchMatch[1] : '';
  const companyName = (
    batchMatch ? metaCompanyTitle.replace(batchMatch[0], '') : metaCompanyTitle
  ).trim();

  const openings = getOpenings(elem);
  const desc = getEmptyOrValue(elem, '.company-tech-desc');
  const link = getEmptyOrValue(elem, '.company-detail a');

  // Only build the JD link when the title anchor and its href exist.
  const titleAnchor = elem.querySelector('.company-title a');
  const href = titleAnchor ? titleAnchor.getAttribute('href') : null;
  const jdLink = href ? getJdLink(href) : '';

  // Size is the first run of digits in the company-detail text.
  const sizeMatch = getEmptyOrValue(elem, '.company-detail').match(/\d+/);
  const size = sizeMatch ? sizeMatch[0] : '';

  return openings.map((opening) => ({
    "YC Batch": companyBatch,
    "Company Name": companyName,
    "Opening": opening.name,
    "Job Detail": opening.detail,
    "Desc": desc,
    "Link": link,
    "JD Link": jdLink,
    "Size": size,
  }));
};
// Column rank for each record key when rows are serialised by
// formatCompanies: lower numbers come first. Every key produced by
// getCompanyInfo needs an entry here, otherwise the sort comparator
// subtracts `undefined` and returns NaN.
const companiesKeysOrder = {
  "YC Batch": 2,
  "Company Name": 1,
  "Opening": 7,
  "Job Detail": 8,
  "Desc": 3,
  "Link": 4,
  "JD Link": 5,
  "Size": 6,
};
/**
 * Serialises company records as tab-separated text with a header row.
 *
 * Columns are the keys of the first record, ordered by their rank in
 * `companiesKeysOrder`. Keys without a rank are placed after all ranked
 * keys in their original relative order, so the comparator never
 * returns NaN.
 *
 * @param {Object[]} companies - Records sharing the same keys.
 * @returns {string|Object[]} Header line plus one line per record; when
 *   `companies` is empty the input array itself is returned unchanged.
 */
const formatCompanies = (companies) => {
  if (companies.length === 0) {
    return companies;
  }
  const delimiter = '\t';

  // Keys with no rank sort last instead of producing `undefined - n`.
  const unranked = Number.MAX_SAFE_INTEGER;
  const rankOf = (key) =>
    Object.prototype.hasOwnProperty.call(companiesKeysOrder, key)
      ? companiesKeysOrder[key]
      : unranked;

  // A tab or line break inside a cell would shift columns or split the
  // row, so collapse each run of them to a single space. Null/undefined
  // become empty cells, matching what Array.prototype.join would emit.
  const toCell = (value) =>
    value == null ? '' : String(value).replace(/[\t\r\n]+/g, ' ');

  const headerKeys = Object.keys(companies[0]).sort(
    (a, b) => rankOf(a) - rankOf(b)
  );
  const header = headerKeys.join(delimiter);
  const strCompanies = companies
    .map((company) =>
      headerKeys.map((key) => toCell(company[key])).join(delimiter)
    )
    .join('\n');
  return `${header}\n${strCompanies}`;
};
/**
 * Scrolls the listing until every advertised company is loaded, then
 * extracts one record per job opening from each `.list-wrapper` element.
 *
 * The expected total is the first number in the `.directory-meta` text.
 * Scrolling stops early when several consecutive scrolls load nothing
 * new, so a total that is never reached cannot loop forever.
 *
 * Elements for which `getCompanyInfo` throws are logged and skipped.
 * The result can be handed to `formatCompanies` for tab-separated text.
 *
 * @returns {Promise<Object[]>} Records for all companies that parsed.
 */
const scrape = async () => {
  const total = parseInt(
    document.querySelector('.directory-meta').innerText.match(/\d+/)[0],
    10
  );

  // Give up after this many scrolls in a row that add no new elements.
  const maxStalledRounds = 5;
  let stalledRounds = 0;
  let foundElements = document.querySelectorAll('.list-wrapper').length;

  while (foundElements < total && stalledRounds < maxStalledRounds) {
    console.log({ foundElements, total });
    scroll();
    await sleep();
    const nowFound = document.querySelectorAll('.list-wrapper').length;
    stalledRounds = nowFound > foundElements ? 0 : stalledRounds + 1;
    foundElements = nowFound;
  }

  if (foundElements < total) {
    console.warn(
      `Stopped scrolling with ${foundElements} of ${total} companies loaded`
    );
  }

  const elements = document.querySelectorAll('.list-wrapper');
  let companies = [];
  elements.forEach((elem) => {
    try {
      companies = companies.concat(getCompanyInfo(elem));
    } catch (error) {
      // Keep going: one malformed element should not abort the whole run.
      console.log(elem);
      console.error(error);
    }
  });
  console.log(companies);
  return companies;
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment