Skip to content

Instantly share code, notes, and snippets.

@CliffordAnderson
Created August 6, 2020 21:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save CliffordAnderson/4a274d6d447b10fde71a64ca7f755672 to your computer and use it in GitHub Desktop.
Save CliffordAnderson/4a274d6d447b10fde71a64ca7f755672 to your computer and use it in GitHub Desktop.
Web scraping for Summer Teams
(:~
 : Web Scraper for Summer Projects
 :
 : Fetches the library projects page and, for every project block
 : (a div with class "media-body" inside the section with id "maincontent"),
 : emits one CSV row per listed member: the project heading plus the member's
 : name rewritten surname-first.
 :
 : Relies on the BaseX fetch and html modules and on BaseX CSV serialization.
 :)
declare option output:method "csv";
declare option output:csv "header=yes, separator=comma";

(:~
 : Rewrites "Given Names Surname" as "Surname, Given Names", keeping an
 : optional generational suffix such as Jr, Sr, II, III or IV at the end.
 : Input that does not match the pattern (for example a single-word name)
 : is returned unchanged.
 :
 : @param $name a whitespace-normalized personal name
 : @return the surname-first form of the name
 :)
declare function local:surname-first($name as xs:string) as xs:string {
  fn:replace(
    $name,
    "^(.+?) ([^\s,]+)(,? (?:[JS]r\.?|III?|IV))?$",
    "$2, $1$3"
  )
};

let $doc := fetch:text("https://www.library.vanderbilt.edu/projects") => html:parse()
let $mainContent := $doc//section[@id = "maincontent"]
return
  (: One csv root holding every row, so the whole result is a single table
     instead of a separate one-row document per participant. :)
  <csv>{
    for $project in $mainContent//div[@class = "media-body"]
    (: Collapse whitespace so line breaks in the markup do not leak into the CSV field. :)
    let $projectName :=
      fn:normalize-space(fn:string-join($project/h3[@class = "media-heading"]/text(), ""))
    (: The member list may be split over several text nodes; join them first
       because fn:translate accepts at most one string. :)
    let $memberText :=
      fn:string-join($project//span[text() = "Members"]/following-sibling::text(), "")
    (: Drop the colon that follows the label, split on commas, and trim each
       piece; the name pattern is anchored, so stray leading or trailing
       whitespace would otherwise prevent it from matching. :)
    for $participant in
      fn:tokenize(fn:translate($memberText, ":", ""), ",") ! fn:normalize-space(.)
    (: Skip blanks produced by a trailing comma or an empty member list. :)
    where $participant ne ""
    return
      <row>
        <project>{$projectName}</project>
        <participants>{local:surname-first($participant)}</participants>
      </row>
  }</csv>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment