Created
June 18, 2025 20:34
-
-
Save lucasheriques/20b6d26fa3179ab068e356e84b03f4d0 to your computer and use it in GitHub Desktop.
Basic job scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env tsx | |
| import { JSDOM } from 'jsdom'; | |
// A single normalized job posting, regardless of which board it came from.
interface JobListing {
  title: string;
  company: string;
  location: string;
  url: string;           // absolute link to the posting
  description?: string;  // not populated by the current scrapers
  posted?: string;       // not populated by the current scrapers
}

// One company to scrape, plus which ATS (applicant tracking system) layout
// its board uses — this selects the parser in scrapeCompany().
interface CompanyConfig {
  name: string;
  url: string;
  type: 'greenhouse' | 'ashby' | 'lever' | 'direct'; // 'direct' has no scraper yet (falls to the switch default)
}
| // Test companies from your list | |
| const testCompanies: CompanyConfig[] = [ | |
| { | |
| name: 'Stripe', | |
| url: 'https://boards.greenhouse.io/embed/job_board?for=Stripe', | |
| type: 'greenhouse' | |
| }, | |
| { | |
| name: 'PostHog', | |
| url: 'https://jobs.ashbyhq.com/posthog', | |
| type: 'ashby' | |
| }, | |
| { | |
| name: 'Spotify', | |
| url: 'https://jobs.lever.co/spotify', | |
| type: 'lever' | |
| } | |
| ]; | |
| async function fetchHTML(url: string): Promise<string> { | |
| try { | |
| const response = await fetch(url, { | |
| headers: { | |
| 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' | |
| } | |
| }); | |
| if (!response.ok) { | |
| throw new Error(`HTTP error! status: ${response.status}`); | |
| } | |
| return await response.text(); | |
| } catch (error) { | |
| console.error(`Error fetching ${url}:`, error); | |
| return ''; | |
| } | |
| } | |
| function scrapeGreenhouse(html: string, companyName: string): JobListing[] { | |
| const dom = new JSDOM(html); | |
| const document = dom.window.document; | |
| const jobs: JobListing[] = []; | |
| // Greenhouse typically uses .opening class for job listings | |
| const jobElements = document.querySelectorAll('.opening, .job, [data-qa="opening"]'); | |
| jobElements.forEach(element => { | |
| try { | |
| const titleElement = element.querySelector('a, .opening-title, h3, h4'); | |
| const locationElement = element.querySelector('.location, .opening-location, [data-qa="location"]'); | |
| const linkElement = element.querySelector('a'); | |
| if (titleElement && linkElement) { | |
| const title = titleElement.textContent?.trim() || ''; | |
| const location = locationElement?.textContent?.trim() || 'Remote'; | |
| const href = linkElement.getAttribute('href') || ''; | |
| const fullUrl = href.startsWith('http') ? href : `https://boards.greenhouse.io${href}`; | |
| if (title) { | |
| jobs.push({ | |
| title, | |
| company: companyName, | |
| location, | |
| url: fullUrl | |
| }); | |
| } | |
| } | |
| } catch (error) { | |
| console.error('Error parsing job element:', error); | |
| } | |
| }); | |
| return jobs; | |
| } | |
| function scrapeAshby(html: string, companyName: string): JobListing[] { | |
| const dom = new JSDOM(html); | |
| const document = dom.window.document; | |
| const jobs: JobListing[] = []; | |
| // Ashby typically uses different selectors | |
| const jobElements = document.querySelectorAll('[data-testid="job-posting"], .job-posting, .ashby-job-posting'); | |
| jobElements.forEach(element => { | |
| try { | |
| const titleElement = element.querySelector('h3, h4, .job-title, [data-testid="job-title"]'); | |
| const locationElement = element.querySelector('.location, [data-testid="location"]'); | |
| const linkElement = element.querySelector('a'); | |
| if (titleElement && linkElement) { | |
| const title = titleElement.textContent?.trim() || ''; | |
| const location = locationElement?.textContent?.trim() || 'Remote'; | |
| const href = linkElement.getAttribute('href') || ''; | |
| const fullUrl = href.startsWith('http') ? href : `https://jobs.ashbyhq.com${href}`; | |
| if (title) { | |
| jobs.push({ | |
| title, | |
| company: companyName, | |
| location, | |
| url: fullUrl | |
| }); | |
| } | |
| } | |
| } catch (error) { | |
| console.error('Error parsing job element:', error); | |
| } | |
| }); | |
| return jobs; | |
| } | |
| function scrapeLever(html: string, companyName: string): JobListing[] { | |
| const dom = new JSDOM(html); | |
| const document = dom.window.document; | |
| const jobs: JobListing[] = []; | |
| // Lever typically uses .posting class | |
| const jobElements = document.querySelectorAll('.posting, .lever-posting'); | |
| jobElements.forEach(element => { | |
| try { | |
| const titleElement = element.querySelector('h5, .posting-title, .lever-posting-title'); | |
| const locationElement = element.querySelector('.location, .posting-location'); | |
| const linkElement = element.querySelector('a'); | |
| if (titleElement && linkElement) { | |
| const title = titleElement.textContent?.trim() || ''; | |
| const location = locationElement?.textContent?.trim() || 'Remote'; | |
| const href = linkElement.getAttribute('href') || ''; | |
| const fullUrl = href.startsWith('http') ? href : `https://jobs.lever.co${href}`; | |
| if (title) { | |
| jobs.push({ | |
| title, | |
| company: companyName, | |
| location, | |
| url: fullUrl | |
| }); | |
| } | |
| } | |
| } catch (error) { | |
| console.error('Error parsing job element:', error); | |
| } | |
| }); | |
| return jobs; | |
| } | |
| async function scrapeCompany(company: CompanyConfig): Promise<JobListing[]> { | |
| console.log(`\nπ Scraping ${company.name} (${company.type})...`); | |
| const html = await fetchHTML(company.url); | |
| if (!html) { | |
| console.log(`β Failed to fetch HTML for ${company.name}`); | |
| return []; | |
| } | |
| let jobs: JobListing[] = []; | |
| switch (company.type) { | |
| case 'greenhouse': | |
| jobs = scrapeGreenhouse(html, company.name); | |
| break; | |
| case 'ashby': | |
| jobs = scrapeAshby(html, company.name); | |
| break; | |
| case 'lever': | |
| jobs = scrapeLever(html, company.name); | |
| break; | |
| default: | |
| console.log(`β Unknown scraper type: ${company.type}`); | |
| return []; | |
| } | |
| console.log(`β Found ${jobs.length} jobs for ${company.name}`); | |
| // Log first few jobs as examples | |
| jobs.slice(0, 3).forEach(job => { | |
| console.log(` π ${job.title} - ${job.location}`); | |
| }); | |
| return jobs; | |
| } | |
| async function main() { | |
| console.log('π Starting job scraper test...'); | |
| const allJobs: JobListing[] = []; | |
| for (const company of testCompanies) { | |
| const jobs = await scrapeCompany(company); | |
| allJobs.push(...jobs); | |
| // Be respectful - wait between requests | |
| await new Promise(resolve => setTimeout(resolve, 2000)); | |
| } | |
| console.log(`\nπ Total jobs found: ${allJobs.length}`); | |
| // Group by company | |
| const jobsByCompany = allJobs.reduce((acc, job) => { | |
| if (!acc[job.company]) acc[job.company] = []; | |
| acc[job.company].push(job); | |
| return acc; | |
| }, {} as Record<string, JobListing[]>); | |
| console.log('\nπ Jobs by company:'); | |
| Object.entries(jobsByCompany).forEach(([company, jobs]) => { | |
| console.log(` ${company}: ${jobs.length} jobs`); | |
| }); | |
| // Save results to file for inspection | |
| const fs = await import('fs/promises'); | |
| await fs.writeFile('job-scraper-results.json', JSON.stringify(allJobs, null, 2)); | |
| console.log('\nπΎ Results saved to job-scraper-results.json'); | |
| } | |
// Run main() only when this file is executed directly, not when imported.
// NOTE(review): the file uses ESM `import` syntax but a CommonJS
// `require.main` guard — under a pure ESM loader `require` is undefined and
// this line would throw a ReferenceError. Confirm how tsx executes this file
// (CJS transpilation vs. native ESM) before relying on the guard.
if (require.main === module) {
  main().catch(console.error);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment