Skip to content

Instantly share code, notes, and snippets.

@kardeiz
Created February 9, 2012 16:43
Show Gist options
  • Save kardeiz/1781007 to your computer and use it in GitHub Desktop.
Save kardeiz/1781007 to your computer and use it in GitHub Desktop.
PowerShell script to harvest all Dublin Core (DC) metadata from an OAI-PMH server. Produces an XML file and a CSV file.
# OAI endpoint and the query string of the initial ListRecords request (oai_dc metadata)
$baseurl = "http://digitalrepository.smu.edu/cgi/oai2.cgi/OAI-script?"
$payload = "verb=ListRecords&metadataPrefix=oai_dc"

# WebClient used for every download; responses are decoded as UTF-8.
# A User-Agent is sent because some servers reject OAI-PMH requests that lack one.
$wc = New-Object -TypeName System.Net.WebClient
$wc.Encoding = [System.Text.Encoding]::UTF8
$wc.Headers.Add("User-Agent", "PowerShell Script")

# fetch the first page; the [xml] cast assumes the server answers with well-formed XML
$url = "$baseurl$payload"
[xml]$oaicont = $wc.DownloadString($url)

# XPath to the ListRecords element, matched by local name so no namespace manager is needed
$listRecordsXPath = '/*/*[local-name()="ListRecords"]'

# node that holds all records; records from additional pages get appended to it
$apnode = $oaicont.SelectSingleNode($listRecordsXPath)

# resumptionToken element of the first page (used to decide whether more pages exist)
$rt = $oaicont.SelectSingleNode($listRecordsXPath + '/*[local-name()="resumptionToken"]')
# Page through the result set: keep requesting as long as the most recent page
# carried a non-empty resumptionToken element ($rt is $null or empty on the last page).
while ($rt.haschildnodes -eq $true)
{
    # pause between requests so the OAI server doesn't get mad
    Start-Sleep -s 10

    # Build the follow-up request. The token is percent-encoded because it is placed
    # in a query string and may contain reserved characters (+, &, =, /, ...).
    $url = $baseurl + "verb=ListRecords&resumptionToken=" + [System.Uri]::EscapeDataString($rt.innertext)

    # The User-Agent is supplied on every request; Set replaces any value that is
    # still present instead of appending a second one.
    $wc.Headers.Set("User-Agent", "PowerShell Script")
    [xml]$oainext = $wc.downloadstring($url)

    # token for the next iteration (absent or empty when this was the last page)
    $rt = $oainext.selectsinglenode("/*/*[local-name()=`"ListRecords`"]/*[local-name()=`"resumptionToken`"]")

    # Copy the children of this page's ListRecords element into the merged document,
    # skipping the page's resumptionToken so paging tokens are not interleaved with records.
    $innerel = $oainext.selectnodes("/*/*[local-name()=`"ListRecords`"]/*[local-name()!=`"resumptionToken`"]")
    foreach ($inone in $innerel)
    {
        # nodes belong to their owner document, so import a deep copy before appending
        $imported = $oaicont.importnode($inone, $true)
        # AppendChild returns the appended node; discard it so it is not written to the output stream
        [void]$apnode.appendchild($imported)
    }
}
# Write the merged document to disk with a pretty-printing XML writer (tab indentation).
$xws = New-Object System.Xml.XmlWriterSettings
$xws.Indent = $true
$xws.IndentChars = "`t"
$xtw = [System.Xml.XmlWriter]::Create("path to output xml", $xws)
try
{
    $oaicont.WriteContentTo($xtw)
    $xtw.Flush()
}
finally
{
    # Always close the writer, even when writing fails, so the output file
    # is not left locked for whatever opens it next.
    $xtw.Close()
}
# transform the output XML file to CSV using an XSLT stylesheet
# TrustedXslt is passed together with a URL resolver when the stylesheet is loaded
$xslSettings = [System.Xml.Xsl.XsltSettings]::TrustedXslt
$xslResolver = New-Object System.Xml.XmlUrlResolver
$xslt = New-Object System.Xml.Xsl.XslCompiledTransform
$xslt.Load("path to stylesheet, e.g., oai2csv.xsl", $xslSettings, $xslResolver)
# reads the XML file and writes the transformation result to the CSV path
$xslt.Transform("path to output xml", "path to output csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment