-
-
Save sakymar/03436cb4de435933c6720d04c50c27ef to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const puppetter = require("puppeteer") | |
const fs = require("fs") | |
/**
 * Scrapes Wikipedia for climate figures of the listed cities of one country.
 *
 * Opens a "List of towns and cities with 100,000 or more inhabitants" page,
 * takes the first link of every row in the table matched to the configured
 * country, then visits each link and reads, from the city's table whose text
 * contains "Climate", the rows whose text contains one of the wanted labels.
 *
 * The browser is closed in every case, including when a navigation or a
 * lookup fails.
 *
 * @returns {Promise<Array<Object>>} One entry per visited city: `{ name }`
 *   plus, for each label, an array of `{ value, time }` cells (empty when the
 *   page has no matching row). The same array is also logged to the console.
 * @throws {Error} When the configured country, or its table, is not found on
 *   the list page. Errors raised by puppeteer propagate unchanged.
 */
const script = async () => {
  // headless: true runs the browser without a visible window; set it to false
  // to watch what is going on while testing changes to the script
  const browser = await puppetter.launch({ headless: true })
  try {
    const page = await browser.newPage()
    //Get the page where your country is between those : A-B • C-D-E-F • G-H-I-J-K • L-M-N-O • P-Q-R-S • T-U-V-W-Y-Z
    await page.goto(
      "https://en.wikipedia.org/wiki/List_of_towns_and_cities_with_100,000_or_more_inhabitants/country:_A-B"
    )
    // The callback is evaluated inside the page, so everything it needs
    // (including the country name) has to be declared within it
    const cityLinks = await page.evaluate(() => {
      const country = "YOUR COUNTRY"
      //remove the elements between the countries and the cities to make the scraping easier
      document.querySelectorAll(".thumb").forEach((thumb) => thumb.remove())
      //get the list of countries and cities
      const countries = Array.from(document.querySelectorAll(".mw-headline"))
      const countryIndex = countries.findIndex((item) =>
        item.innerText.includes(country)
      )
      if (countryIndex === -1) {
        throw new Error(`Country "${country}" was not found on this page`)
      }
      // NOTE(review): assumes the n-th headline pairs with the n-th table —
      // confirm against the current page markup
      const cityTables = document.querySelectorAll("h2 + table.wikitable")
      const cityTable = cityTables[countryIndex]
      if (!cityTable) {
        throw new Error(`No city table was found for country "${country}"`)
      }
      // rows without any link (e.g. header rows) yield undefined here
      return Array.from(cityTable.querySelectorAll("tbody tr")).map(
        (row) => row.querySelector("a")?.href
      )
    })
    //at this point you have the list of cities of your country, a link for each
    const data = []
    //for each city we go to the page and get the data
    for (const link of cityLinks.filter(Boolean)) {
      await page.goto(link)
      const cityData = await page.evaluate(() => {
        const name = document.querySelector("h1").innerText
        const values = { name }
        //there are more types of data if you need
        const labels = ["Average high", "Average precipitation", "sunshine hours"]
        const tables = Array.from(
          document.querySelectorAll("table.wikitable > tbody")
        )
        const table = tables.find((item) => item.innerText.includes("Climate"))
        const rows = Array.from(table?.children ?? [])
        // the second row of the table supplies the "time" of each column
        const timeCells = table?.children?.[1]?.children
        labels.forEach((label) => {
          const labelRow = rows.find((item) => item?.innerText?.includes(label))
          values[label] = Array.from(labelRow?.children ?? []).map(
            (item, index) => ({
              value: item?.innerText ?? "",
              time: timeCells?.[index]?.innerText ?? "",
            })
          )
        })
        return values
      })
      data.push(cityData)
    }
    //here you go, you have the data of all the major cities of your country
    console.log(data)
    return data
  } finally {
    // always release the browser, otherwise the process never exits
    await browser.close()
  }
}
// Run the scraper; report a failure and exit non-zero instead of leaving the
// rejected promise unhandled
script().catch((error) => {
  console.error(error)
  process.exitCode = 1
})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment