Skip to content

Instantly share code, notes, and snippets.

@lucasheriques
Created June 18, 2025 20:34
Show Gist options
  • Select an option

  • Save lucasheriques/20b6d26fa3179ab068e356e84b03f4d0 to your computer and use it in GitHub Desktop.

Save lucasheriques/20b6d26fa3179ab068e356e84b03f4d0 to your computer and use it in GitHub Desktop.
Basic job scraper
#!/usr/bin/env tsx
import { JSDOM } from 'jsdom';
/**
 * A single normalized job posting collected from any supported job board.
 */
interface JobListing {
// Job title as shown on the board.
title: string;
// Company name (taken from the CompanyConfig, not parsed from HTML).
company: string;
// Location text from the board; scrapers default this to 'Remote' when absent.
location: string;
// Absolute URL to the posting (relative hrefs are resolved by each scraper).
url: string;
// Optional fields; not populated by the current scrapers.
description?: string;
posted?: string;
}
/**
 * Describes one company's job board and which scraper strategy to use for it.
 */
interface CompanyConfig {
// Display name used in logs and in each JobListing.company.
name: string;
// URL of the company's job-board page to fetch.
url: string;
// Which scraper to dispatch to; 'direct' currently has no implementation.
type: 'greenhouse' | 'ashby' | 'lever' | 'direct';
}
// Test companies from your list
const testCompanies: CompanyConfig[] = [
{
name: 'Stripe',
url: 'https://boards.greenhouse.io/embed/job_board?for=Stripe',
type: 'greenhouse'
},
{
name: 'PostHog',
url: 'https://jobs.ashbyhq.com/posthog',
type: 'ashby'
},
{
name: 'Spotify',
url: 'https://jobs.lever.co/spotify',
type: 'lever'
}
];
/**
 * Fetch a URL and return its response body as text.
 *
 * Sends a desktop-browser User-Agent so job boards serve the normal HTML.
 * Returns '' on ANY failure (network error, non-2xx status, timeout) so
 * callers can treat an empty string as "fetch failed" — this matches how
 * scrapeCompany checks the result.
 *
 * @param url       Page to fetch.
 * @param timeoutMs Abort the request after this many milliseconds
 *                  (default 30s). Without this, a single stalled
 *                  connection would hang the whole sequential scrape.
 */
async function fetchHTML(url: string, timeoutMs = 30_000): Promise<string> {
  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
      },
      // Fix: the original had no timeout, so a hung server blocked the
      // entire run indefinitely. AbortSignal.timeout is built in on Node 18+.
      signal: AbortSignal.timeout(timeoutMs)
    });
    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }
    return await response.text();
  } catch (error) {
    // Deliberate best-effort: log and signal failure with '' rather than throw.
    console.error(`Error fetching ${url}:`, error);
    return '';
  }
}
function scrapeGreenhouse(html: string, companyName: string): JobListing[] {
const dom = new JSDOM(html);
const document = dom.window.document;
const jobs: JobListing[] = [];
// Greenhouse typically uses .opening class for job listings
const jobElements = document.querySelectorAll('.opening, .job, [data-qa="opening"]');
jobElements.forEach(element => {
try {
const titleElement = element.querySelector('a, .opening-title, h3, h4');
const locationElement = element.querySelector('.location, .opening-location, [data-qa="location"]');
const linkElement = element.querySelector('a');
if (titleElement && linkElement) {
const title = titleElement.textContent?.trim() || '';
const location = locationElement?.textContent?.trim() || 'Remote';
const href = linkElement.getAttribute('href') || '';
const fullUrl = href.startsWith('http') ? href : `https://boards.greenhouse.io${href}`;
if (title) {
jobs.push({
title,
company: companyName,
location,
url: fullUrl
});
}
}
} catch (error) {
console.error('Error parsing job element:', error);
}
});
return jobs;
}
function scrapeAshby(html: string, companyName: string): JobListing[] {
const dom = new JSDOM(html);
const document = dom.window.document;
const jobs: JobListing[] = [];
// Ashby typically uses different selectors
const jobElements = document.querySelectorAll('[data-testid="job-posting"], .job-posting, .ashby-job-posting');
jobElements.forEach(element => {
try {
const titleElement = element.querySelector('h3, h4, .job-title, [data-testid="job-title"]');
const locationElement = element.querySelector('.location, [data-testid="location"]');
const linkElement = element.querySelector('a');
if (titleElement && linkElement) {
const title = titleElement.textContent?.trim() || '';
const location = locationElement?.textContent?.trim() || 'Remote';
const href = linkElement.getAttribute('href') || '';
const fullUrl = href.startsWith('http') ? href : `https://jobs.ashbyhq.com${href}`;
if (title) {
jobs.push({
title,
company: companyName,
location,
url: fullUrl
});
}
}
} catch (error) {
console.error('Error parsing job element:', error);
}
});
return jobs;
}
function scrapeLever(html: string, companyName: string): JobListing[] {
const dom = new JSDOM(html);
const document = dom.window.document;
const jobs: JobListing[] = [];
// Lever typically uses .posting class
const jobElements = document.querySelectorAll('.posting, .lever-posting');
jobElements.forEach(element => {
try {
const titleElement = element.querySelector('h5, .posting-title, .lever-posting-title');
const locationElement = element.querySelector('.location, .posting-location');
const linkElement = element.querySelector('a');
if (titleElement && linkElement) {
const title = titleElement.textContent?.trim() || '';
const location = locationElement?.textContent?.trim() || 'Remote';
const href = linkElement.getAttribute('href') || '';
const fullUrl = href.startsWith('http') ? href : `https://jobs.lever.co${href}`;
if (title) {
jobs.push({
title,
company: companyName,
location,
url: fullUrl
});
}
}
} catch (error) {
console.error('Error parsing job element:', error);
}
});
return jobs;
}
/**
 * Fetch one company's board and run the scraper matching its configured type.
 * Returns [] when the fetch fails or the type has no scraper ('direct').
 * Logs progress and a preview of the first three listings.
 */
async function scrapeCompany(company: CompanyConfig): Promise<JobListing[]> {
  console.log(`\nπŸ” Scraping ${company.name} (${company.type})...`);

  const html = await fetchHTML(company.url);
  if (!html) {
    console.log(`❌ Failed to fetch HTML for ${company.name}`);
    return [];
  }

  // Dispatch table replacing the switch; types absent here (e.g. 'direct')
  // hit the "unknown" branch exactly as the original default case did.
  const scrapers: Partial<Record<CompanyConfig['type'], (pageHtml: string, name: string) => JobListing[]>> = {
    greenhouse: scrapeGreenhouse,
    ashby: scrapeAshby,
    lever: scrapeLever
  };

  const scrape = scrapers[company.type];
  if (!scrape) {
    console.log(`❓ Unknown scraper type: ${company.type}`);
    return [];
  }

  const jobs = scrape(html, company.name);
  console.log(`βœ… Found ${jobs.length} jobs for ${company.name}`);

  // Log first few jobs as examples
  for (const job of jobs.slice(0, 3)) {
    console.log(` πŸ“‹ ${job.title} - ${job.location}`);
  }
  return jobs;
}
/**
 * Entry point: scrape each configured company sequentially (with a 2s
 * pause between requests), print a per-company summary, and dump all
 * results to job-scraper-results.json.
 */
async function main() {
  console.log('πŸš€ Starting job scraper test...');

  const allJobs: JobListing[] = [];
  for (const company of testCompanies) {
    allJobs.push(...(await scrapeCompany(company)));
    // Be respectful - wait between requests
    await new Promise(resolve => setTimeout(resolve, 2000));
  }

  console.log(`\nπŸ“Š Total jobs found: ${allJobs.length}`);

  // Group by company with a plain loop (equivalent to the reduce version).
  const jobsByCompany: Record<string, JobListing[]> = {};
  for (const job of allJobs) {
    if (!jobsByCompany[job.company]) jobsByCompany[job.company] = [];
    jobsByCompany[job.company].push(job);
  }

  console.log('\nπŸ“ˆ Jobs by company:');
  for (const [company, jobs] of Object.entries(jobsByCompany)) {
    console.log(` ${company}: ${jobs.length} jobs`);
  }

  // Save results to file for inspection
  const fs = await import('fs/promises');
  await fs.writeFile('job-scraper-results.json', JSON.stringify(allJobs, null, 2));
  console.log('\nπŸ’Ύ Results saved to job-scraper-results.json');
}
// Run main() only when this file is executed directly, not when imported.
// NOTE(review): `require.main === module` is a CommonJS idiom. It works when
// tsx loads this file as CJS, but `require` is undefined in ESM contexts
// (e.g. "type": "module") — confirm the guard still fires in this project.
if (require.main === module) {
main().catch(console.error);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment