Skip to content

Instantly share code, notes, and snippets.

@sakymar
Created October 28, 2022 15:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sakymar/03436cb4de435933c6720d04c50c27ef to your computer and use it in GitHub Desktop.
Save sakymar/03436cb4de435933c6720d04c50c27ef to your computer and use it in GitHub Desktop.
const puppetter = require("puppeteer")
const fs = require("fs")
/**
 * Scrapes Wikipedia climate data for the major cities of one country and
 * logs the result to the console.
 *
 * Steps:
 *  1. Open the "towns and cities with 100,000 or more inhabitants" list page.
 *  2. Find the table matched to `country` and take the first link of each row.
 *  3. Visit every linked city page and read selected rows of its climate table.
 *
 * @param {object} [options]
 * @param {string} [options.country] - Text searched for in the list page's
 *   section headlines to locate the country.
 * @param {string} [options.listUrl] - List page that contains the country.
 *   Pick the page covering its initial:
 *   A-B • C-D-E-F • G-H-I-J-K • L-M-N-O • P-Q-R-S • T-U-V-W-Y-Z
 * @returns {Promise<void>} Resolves once the data has been logged.
 * @throws {Error} If no city table can be matched to `country`, or if any
 *   navigation/evaluation step fails.
 */
const script = async ({
  country = "YOUR COUNTRY",
  listUrl = "https://en.wikipedia.org/wiki/List_of_towns_and_cities_with_100,000_or_more_inhabitants/country:_A-B",
} = {}) => {
  // headless: true runs Chromium without a visible window; switch it to false
  // to watch the browser while testing changes to the script.
  const browser = await puppetter.launch({ headless: true })
  try {
    const page = await browser.newPage()
    await page.goto(listUrl)

    // The callback below runs inside the page, so it cannot see variables from
    // this file; `country` is handed over as an explicit argument.
    const cityLinks = await page.evaluate((countryName) => {
      // Remove the elements between the countries and the cities to make the
      // scraping easier.
      document.querySelectorAll(".thumb").forEach((thumb) => thumb.remove())

      // Locate the country among the section headlines.
      const headlines = Array.from(document.querySelectorAll(".mw-headline"))
      const countryIndex = headlines.findIndex((item) =>
        item.innerText.includes(countryName)
      )

      // NOTE(review): this assumes the n-th headline corresponds to the n-th
      // "h2 + table.wikitable" on the page — confirm against the live page.
      const cityTable = document.querySelectorAll("h2 + table.wikitable")[
        countryIndex
      ]
      if (countryIndex === -1 || !cityTable) {
        throw new Error(`No city table found for country "${countryName}"`)
      }

      const rows = Array.from(cityTable.querySelectorAll("tbody tr"))
      // Rows without a link (e.g. header rows) yield undefined here and are
      // filtered out by the caller.
      return rows.map((row) => row.querySelector("a")?.href)
    }, country)

    // At this point we have one link per city of the country.
    const data = []
    // Visit the pages one after another: they all share a single browser tab.
    for (const link of cityLinks.filter(Boolean)) {
      await page.goto(link)
      const cityData = await page.evaluate(() => {
        const name = document.querySelector("h1").innerText
        const values = { name }
        // There are more types of data in the table if you need them.
        const labels = ["Average high", "Average precipitation", "sunshine hours"]

        const tables = Array.from(
          document.querySelectorAll("table.wikitable > tbody")
        )
        // A city page without a climate table leaves `table` undefined; every
        // label then maps to an empty array.
        const table = tables.find((item) => item.innerText.includes("Climate"))
        // Cells of the table's second row supply the `time` value paired with
        // each data cell at the same column index.
        const headerCells = table?.children?.[1]?.children

        labels.forEach((label) => {
          const labelRow = Array.from(table?.children ?? []).find((item) =>
            item?.innerText?.includes(label)
          )
          values[label] = Array.from(labelRow?.children ?? []).map(
            (cell, index) => ({
              value: cell?.innerText || "",
              time: headerCells?.[index]?.innerText || "",
            })
          )
        })
        return values
      })
      data.push(cityData)
    }

    // Here you go: the data of all the major cities of the country.
    console.log(data)
  } finally {
    // Always shut Chromium down, even on failure; otherwise the browser
    // process is left running and Node never exits.
    await browser.close()
  }
}
// Run the scraper. Report any failure and exit with a non-zero status instead
// of leaving the returned promise's rejection unhandled.
script().catch((error) => {
  console.error(error)
  process.exitCode = 1
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment