Skip to content

Instantly share code, notes, and snippets.

@AhmedSamy
Created February 23, 2024 12:01
Show Gist options
  • Save AhmedSamy/c11f502a60d162dbf4fadb4904065177 to your computer and use it in GitHub Desktop.
Save AhmedSamy/c11f502a60d162dbf4fadb4904065177 to your computer and use it in GitHub Desktop.
scrap.js
import { launchPuppeteer } from 'crawlee'
import path from 'path'
import { fileURLToPath } from 'url';
import { dirname } from 'path';
// ES modules do not define __filename/__dirname, so derive both from this
// module's own URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Company name that is typed into the register search form.
const companyName = 'Paylane GmbH';

// Browser launch settings: run with a visible window (not headless), disable
// Puppeteer's fixed default viewport and start the window maximized.
const launchOptions = {
  headless: false,
  defaultViewport: null,
  args: ['--start-maximized'],
};

// Launch the web browser.
const browser = await launchPuppeteer({ launchOptions });
// Open a new tab and load the register's search form.
console.log('Open target page');
const page = await browser.newPage();
await page.goto('https://www.handelsregister.de/rp_web/normalesuche.xhtml');

// Type the company name into the `#form:schlagwoerter` field, capturing a
// screenshot before and after the input for debugging.
console.log('Fill in search form');
await page.screenshot({ path: 'before_form.png' });
await page.type('#form\\:schlagwoerter', companyName);
await page.screenshot({ path: 'after_form.png' });

// Submit the form. The navigation wait is created before the click is issued
// so the page load triggered by the click cannot be missed.
console.log('Submit search form');
const searchResultsLoaded = page.waitForNavigation({ waitUntil: 'networkidle2' });
const submitClicked = page.click('#form\\:btnSuche[type="submit"]');
await Promise.all([searchResultsLoaded, submitClicked]);
await page.screenshot({ path: 'after_submit.png' });
// Locate the "DK" link among the search results and follow it.
console.log('Extract search results');

// Candidate links are all elements carrying the class "dokumentList".
const anchors = await page.$$('.dokumentList');

// Pick the FIRST candidate whose visible text contains "DK".
let dkAnchor;
for (const anchor of anchors) {
  const value = await (await anchor.getProperty('innerText')).jsonValue();
  if (value.includes('DK')) {
    dkAnchor = anchor;
    console.log('Found DK link:', value);
    // Stop here: without the break a later match would overwrite the first.
    break;
  }
}

// Fail with a clear message instead of a TypeError on `undefined.click()`.
if (!dkAnchor) {
  throw new Error(`No "DK" link found in the search results for "${companyName}"`);
}

// Click the link; the navigation wait is registered before the click so the
// resulting page load is not missed.
await Promise.all([
  page.waitForNavigation({ waitUntil: 'networkidle2' }),
  dkAnchor.click(),
]);
// Find the span whose text contains "Documents on register number" and click
// it.
// `spans` is declared with `let` because it is reassigned further down the
// script.
let spans = await page.$$('span');
let rootUrlLink;
for (const span of spans) {
  const value = await (await span.getProperty('innerText')).jsonValue();
  if (value.includes('Documents on register number')) {
    // Deliberately no break: when several spans match, the last one in
    // document order is the one that gets clicked.
    rootUrlLink = span;
    console.log('Span Found:', value);
  }
}
console.log('Click on rootUrlLink:', rootUrlLink);

// Fail with a clear message instead of a TypeError on `undefined.click()`.
if (!rootUrlLink) {
  throw new Error('No span containing "Documents on register number" found on the page');
}
await rootUrlLink.click();
// Query the spans again (the page has been clicked since the previous lookup)
// and find the one whose text contains "List of shareholders".
spans = await page.$$('span');
let shareholderLink;
for (const span of spans) {
  const value = await (await span.getProperty('innerText')).jsonValue();
  if (value.includes('List of shareholders')) {
    // No break: when several spans match, the last one in document order is
    // the one that gets clicked.
    shareholderLink = span;
    console.log('Span Found list of shareholders:', value);
  }
}

// The screenshot and log run before the guard so they are still produced for
// debugging when the span is missing.
await page.screenshot({ path: 'after_span_search_shareholder.png', fullPage: true });
console.log('Click on shareholderLink:', shareholderLink);

// Fail with a clear message instead of a TypeError on `undefined.click()`.
if (!shareholderLink) {
  throw new Error('No span containing "List of shareholders" found on the page');
}
await shareholderLink.click();
// Final step: find the "Download" button, allow downloads into ./downloads
// next to this script, click the button, then close the browser.
// The try/finally guarantees the browser is closed even if this step fails.
try {
  // Promise-based delay. Used instead of page.waitForTimeout(), which is
  // deprecated and no longer available in newer Puppeteer releases.
  const sleep = (ms) => new Promise((resolve) => {
    setTimeout(resolve, ms);
  });

  // Wait 2 seconds after the previous click before looking for the button.
  await sleep(2000);

  // Find a button whose visible text contains "Download". No break: when
  // several buttons match, the last one in document order is used.
  const buttons = await page.$$('button');
  let downloadButton;
  for (const button of buttons) {
    const value = await (await button.getProperty('innerText')).jsonValue();
    if (value.includes('Download')) {
      downloadButton = button;
      console.log('Button Found:', value);
    }
  }

  // Taken before the guard so the screenshot exists for debugging even when
  // no button was found.
  await page.screenshot({ path: 'after_button_search.png', fullPage: true });

  // Fail with a clear message instead of a TypeError on `undefined.asElement()`.
  if (!downloadButton) {
    throw new Error('No button containing "Download" found on the page');
  }
  console.log(downloadButton.asElement());

  // Tell the browser (via the DevTools protocol) to accept downloads and to
  // store them in the "downloads" directory next to this script.
  const client = await page.target().createCDPSession();
  await client.send('Page.setDownloadBehavior', {
    behavior: 'allow',
    downloadPath: path.join(__dirname, 'downloads'),
  });

  console.log('Click on downloadButton:', downloadButton);
  await downloadButton.click();

  // Fixed 5 second grace period for the download; no completion signal is
  // awaited here.
  await sleep(5000);
} finally {
  // Close browser
  await browser.close();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment