@kanzure
Created January 16, 2013 21:57
sciencemag.org downloader
#!/usr/bin/perl
# Scraper for Science Magazine (sciencemag.org).
# Needs curl, grep, and mkdir, so just use it on Linux.
# Downloads each paper and all available supplementary data,
# including movies and tables.
# Also downloads the table of contents for each issue and stores
# everything under the same paths as on the site, so one can browse
# from each TOC.
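# Usage (a sketch; assumes perl, curl, grep, and mkdir are on the PATH, and
# that you saved this gist as, say, sciencemag-scraper.pl):
#   perl sciencemag-scraper.pl
# Run it from the directory that should hold the archive, and adjust the
# starting issue ($x) and volume ($y) below first.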
use strict;
use warnings;

# Starting issue and volume; the script walks backwards from here.
my $x = 6116;    # issue
my $y = 339;     # volume
for (; $x >= 1; $x--) {
    # Let's not DDoS them.
    sleep(5);
    # Fetch the table of contents for this issue.
    my $url  = "http://www.sciencemag.org/content/$y/$x.toc";
    my $name = "$x.toc";
    `curl -s --retry 3 -A GoogleBot $url -o $name`;
    # Past the last issue of a volume the site answers "Content not found",
    # so move on to the previous volume. This could probably be done more elegantly.
    if (`grep "Content not found" $name`) {
        $y--;
        $url  = "http://www.sciencemag.org/content/$y/$x.toc";
        $name = "$x.$y";
        `curl -s --retry 3 -A GoogleBot $url -o $name`;
    }
    # Pull the TOC lines that link to supplementary material and to the full-text PDFs.
    my @supp  = `grep -B 1 "Supporting Online Material" $name`;
    my @supp1 = `grep suppl $name`;
    push(@supp, @supp1);
    my @stuff = `grep "Full Text (PDF)" $name`;
    `mkdir $y`;
    `mkdir $y/$x`;
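    # Each grepped line is raw HTML. A hypothetical example (not taken from the
    # live site) might look like:
    #   <a href="/content/339/6116/123.full.pdf">Full Text (PDF)</a>
    # Splitting such a line on '"' leaves the link path in $file[1]; lines
    # matching /last/ presumably carry one extra quoted attribute before the
    # href, so for those the path sits in $file[3] instead.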
    # Time to parse the TOC page to find the papers and files.
    foreach my $mag (@stuff) {
        my @file = split(/\"/, $mag);
        my $bob;
        if ($mag =~ /last/) { $bob = $file[3]; }
        else                { $bob = $file[1]; }
        my $paperurl = "http://www.sciencemag.org$bob";
        my @final    = split(/\//, $bob);
        # Save the paper PDF under volume/issue.
        `curl -s -A GoogleBot $paperurl -o $y/$x/$final[4]`;
    }
    # Find the supplemental materials linked from the TOC.
    foreach my $sup (@supp) {
        my @file = split(/\"/, $sup);
        my $bob;
        if ($sup =~ /last/) { $bob = $file[3]; }
        else                { $bob = $file[1]; }
        # Recreate the site's directory structure locally, one level at a time.
        my @dirs = split(/\//, $bob);
        `mkdir ./$dirs[1]/$dirs[2]`;
        `mkdir ./$dirs[1]/$dirs[2]/$dirs[3]`;
        `mkdir ./$dirs[1]/$dirs[2]/$dirs[3]/$dirs[4]`;
        `mkdir ./$dirs[1]/$dirs[2]/$dirs[3]/$dirs[4]/$dirs[5]`;
        my $suppurl = "http://www.sciencemag.org$bob";
        `curl -s -A GoogleBot $suppurl -o .$bob`;
        # The supplement page in turn links to the PDF supplement, movies, and tables.
        my @morestuff = `grep "Download Supplement" .$bob`;
        my @movies    = `grep "Movie S" .$bob`;
        my @tables    = `grep "Table S" .$bob`;
        foreach my $more (@morestuff) {
            my @file = split(/\"/, $more);
            my $bob  = $file[3];
            my $finalname;
            if ($bob =~ /science/) {
                my @final = split(/\//, $bob);
                $finalname = $final[7];
            }
            else {
                my @final = split(/\./, $bob);
                $finalname = "Supp.$final[2].pdf";
            }
            my $fileurl = "http://www.sciencemag.org$bob";
            `curl -s -A GoogleBot $fileurl -o $y/$x/$finalname`;
        }
        foreach my $mov (@movies) {
            my @file = split(/\"/, $mov);
            my $bob  = $file[3];
            my @final = split(/\//, $bob);
            my $finalname = $final[7];
            `mkdir ./$final[1]/$final[2]`;
            `mkdir ./$final[1]/$final[2]/$final[3]`;
            `mkdir ./$final[1]/$final[2]/$final[3]/$final[4]`;
            `mkdir ./$final[1]/$final[2]/$final[3]/$final[4]/$final[5]`;
            `mkdir ./$final[1]/$final[2]/$final[3]/$final[4]/$final[5]/$final[6]`;
            my $movurl = "http://www.sciencemag.org$bob";
            `curl -s -A GoogleBot $movurl -o ./$final[1]/$final[2]/$final[3]/$final[4]/$final[5]/$final[6]/$finalname`;
        }
        foreach my $tab (@tables) {
            my @file = split(/\"/, $tab);
            my $bob  = $file[3];
            my @final = split(/\//, $bob);
            my $finalname = $final[7];
            `mkdir ./$final[1]/$final[2]`;
            `mkdir ./$final[1]/$final[2]/$final[3]`;
            `mkdir ./$final[1]/$final[2]/$final[3]/$final[4]`;
            `mkdir ./$final[1]/$final[2]/$final[3]/$final[4]/$final[5]`;
            `mkdir ./$final[1]/$final[2]/$final[3]/$final[4]/$final[5]/$final[6]`;
            my $taburl = "http://www.sciencemag.org$bob";
            `curl -s -A GoogleBot $taburl -o ./$final[1]/$final[2]/$final[3]/$final[4]/$final[5]/$final[6]/$finalname`;
        }
    }
}