Last active
April 14, 2023 09:50
-
-
Save myselfhimself/85897805926fe89d6cffb1511662dd30 to your computer and use it in GitHub Desktop.
Parish contact details scraping into CSV
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Just paste everything into any web browser's developer console and press Enter to obtain a CSV file.
// This is intended for scraping parish address details from a famous French city's website.
// Tailor at will.
// Public domain - no attribution required.
(() => { | |
// Accumulates every output line so the whole CSV can be printed with a single
// console.log call once scraping is finished.
let consoleLogBuffer = "";

/**
 * Appends one line to the output buffer; nothing is printed by this call.
 * @param {string} s - Line to append (a trailing "\n" is added).
 * @returns {string} The buffer contents after the append.
 */
const console_log = (s) => (consoleLogBuffer += s + "\n");

// Field delimiter used for the header line and for every data row.
const csvSeparator = "|";

// Column names: three scalar columns followed by four two-column address groups.
const csvHeader = [
  "parishName",
  "phoneNumber",
  "email",
  "recommendedPostAddress1",
  "recommendedPostAddress2",
  "postAddress1",
  "postAddress2",
  "churchAddress1",
  "churchAddress2",
  "address1",
  "address2",
];
// Matches runs of one or two digits; used to regroup a phone number in pairs.
// String.prototype.match with a global regex always scans from the start, so
// sharing this single instance across calls is safe.
const re = /\d{1,2}/g;

/**
 * Reformats a phone number as space-separated groups of at most two digits.
 *
 * All whitespace (including non-breaking spaces) is removed before grouping,
 * so the pairs do not depend on how the source text happened to be spaced.
 * Other non-digit characters (".", "+", ...) are dropped and act as group
 * boundaries.
 *
 * @param {string} phoneNumberRaw - Phone number text as scraped.
 * @returns {string} Digits grouped by two, e.g. "01 42 34 56 78"; an empty
 *   string when the input is missing or contains no digit.
 */
const asSpacedPhoneNumber = (phoneNumberRaw) => {
  const compact = String(phoneNumberRaw ?? "").replace(/\s+/g, "");
  // match() returns null when nothing matches; fall back to no groups.
  return (compact.match(re) ?? []).join(" ");
};
// First output line: the column names joined with the CSV separator.
console_log(csvHeader.join(csvSeparator));
/**
 * Serializes one parish record as a single CSV row.
 *
 * The row holds the parish name (prefixed with "Paroisse "), the phone number,
 * the email, then four address groups of two cells each: the recommended
 * postal address (postal address if present, else the generic address, else
 * the church address), the postal address, the church address and the generic
 * address. A missing address is written as a bare separator, i.e. two empty
 * cells, so every row keeps the same number of columns.
 *
 * @param {object} parishData - Record with `name` and optional `phoneNumber`,
 *   `email`, `postAddress`, `churchAddress`, `address` (addresses are
 *   two-element arrays).
 * @param {string} [separator=csvSeparator] - Cell delimiter.
 * @returns {string} The row, without a trailing newline.
 */
const asCsvLine = (parishData, separator = csvSeparator) => {
  const joinCells = (address) => address?.join(separator);
  const post = joinCells(parishData.postAddress);
  const church = joinCells(parishData.churchAddress);
  const generic = joinCells(parishData.address);
  // Stands in for an absent address: one separator == two empty cells.
  const blank = separator;

  return [
    `Paroisse ${parishData.name}`,
    parishData.phoneNumber ?? "",
    parishData.email ?? "",
    post || generic || church || blank,
    post || blank,
    church || blank,
    generic || blank,
  ].join(separator);
};
/**
 * Splits a one-line address of the form "<first part>, <second part>" into
 * its two parts.
 *
 * Despite the name, the result is a two-element array, not an object.
 *
 * @param {string} rawAddressLine - Address text containing exactly one ", ".
 * @returns {string[]} The two parts, in order.
 * @throws {Error} When the line does not split into exactly two parts
 *   (including when it is missing).
 */
const asAddressObject = (rawAddressLine) => {
  const parts = String(rawAddressLine ?? "").split(", ");
  if (parts.length !== 2) {
    // `Exception` is not a JavaScript global; `Error` carries the message.
    throw new Error(`address lines count issue: ${rawAddressLine}`);
  }
  return parts;
};
// Heading texts that are skipped (presumably site navigation / section titles
// rather than parish names — verify against the target page).
const nonParishHeadings = new Set([
  "Choisir un arrondissement",
  "Prier, Célébrer",
  "S’informer",
  "Agir",
  "Donner à l’Église",
]);

// Exact-match labels: label text -> [record key, converter for the value that
// follows the label]. The phone label is handled separately (prefix match).
const labelledFields = new Map([
  ["Adresse de l'église", ["churchAddress", asAddressObject]],
  ["Adresse postale", ["postAddress", asAddressObject]],
  ["Adresse", ["address", asAddressObject]],
  ["Email", ["email", (value) => value]],
  ["Site", ["website", (value) => value]],
]);

// For every <h4>, read the <p> that immediately follows it, split its text
// into alternating "label" / "value" fragments and turn them into one CSV row.
$("h4").each(function () {
  const name = $(this).text();
  if (nonParishHeadings.has(name)) {
    return;
  }

  // Line breaks/tabs and " : " both become "|" so labels and values end up as
  // separate items; blank fragments are dropped after trimming so they can
  // never be mistaken for a value.
  const details = $(this)
    .next("p")
    .text()
    .replace(/[\f\t\n\v\r]+| : /g, "|")
    .split("|")
    .map((item) => item.trim())
    .filter(Boolean);

  const parishData = { name, rawDetails: details };

  // Stop one short of the end: a label in last position has no value to read
  // (and must not wrap around to the first item).
  for (let i = 0; i + 1 < details.length; i++) {
    const label = details[i];
    const value = details[i + 1];
    const [key, convert] = label.startsWith("Tél")
      ? ["phoneNumber", asSpacedPhoneNumber]
      : (labelledFields.get(label) ?? []);
    if (key !== undefined) {
      parishData[key] = convert(value);
      i++; // the value was consumed together with its label
    }
  }

  console_log(asCsvLine(parishData));
});

// Print the header and all rows in one go so the output is easy to copy.
console.log(consoleLogBuffer);
})(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment