Created
August 6, 2020 21:00
-
-
Save CliffordAnderson/4a274d6d447b10fde71a64ca7f755672 to your computer and use it in GitHub Desktop.
Web scraping for Summer Teams
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(:~
 : Web Scraper for Summer Projects
 :
 : Fetches the Vanderbilt Library projects page, parses it as HTML, and emits
 : one CSV row per (project, participant) pair with the columns `project`
 : and `participants`. Participant names are rewritten from
 : "First [Middle] Last[ Suffix]" to "Last, First [Middle][ Suffix]".
 :
 : Requires BaseX (fetch:, html: and output: are BaseX modules/options).
 :)
declare option output:method "csv";
declare option output:csv "header=yes, separator=comma";

let $doc := fetch:text("https://www.library.vanderbilt.edu/projects") => html:parse()
let $mainContent := $doc//section[@id="maincontent"]
return
  (: A single <csv> root wraps every row: this is the structure the CSV
     serializer expects, rather than one <csv> document per participant. :)
  <csv>{
    for $project in $mainContent//div[@class="media-body"]
    (: Join before use so that a heading or member list split across several
       text nodes does not raise a type error in the string functions. :)
    let $projectName :=
      fn:string-join($project/h3[@class="media-heading"]/text()) => fn:normalize-space()
    let $projectParticipants :=
      fn:string-join($project//span[text()="Members"]/following-sibling::text())
    let $participants :=
      fn:translate($projectParticipants, ":", "")
      (: Drop the comma in front of a generational suffix ("Smith, Jr." becomes
         "Smith Jr.") so that tokenizing on "," keeps the suffix with its name. :)
      => fn:replace(",\s*((?:[JS]r\.?|III?|IV)\s*(?:,|$))", " $1")
      => fn:tokenize(",")
    for $participant in $participants
    (: Trim and collapse whitespace; stray spaces or line breaks would
       otherwise stop the anchored name pattern below from matching. :)
    let $trimmed := fn:normalize-space($participant)
    (: Skip empty tokens, e.g. those produced by a trailing comma. :)
    where $trimmed
    (: The space after the comma is explicit here; before trimming it only
       appeared when the token happened to carry leading whitespace. :)
    let $name := fn:replace($trimmed, "^(.+?) ([^\s,]+)(,? (?:[JS]r\.?|III?|IV))?$", "$2, $1$3")
    return
      <row>
        <project>{$projectName}</project>
        <participants>{$name}</participants>
      </row>
  }</csv>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment