Skip to content

Instantly share code, notes, and snippets.

@dpavlin
Created December 8, 2013 23:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dpavlin/7865097 to your computer and use it in GitHub Desktop.
Save dpavlin/7865097 to your computer and use it in GitHub Desktop.
Download Thomson master journal list
#!/usr/bin/perl
use warnings;
use strict;
use Mojo::UserAgent;
use Mojo::DOM;
# Download the Thomson master journal list and print every entry to STDOUT
# as one pipe-delimited line prefixed with the PC value it was fetched for.
# Progress and fetch errors are reported on STDERR.
#
# Usage: pass the PC values to fetch as command-line arguments; with no
# arguments the default set below is used.

my $ua = Mojo::UserAgent->new;

# CGI endpoint serving the paged, print-mode journal list.
my $base_url = 'http://science.thomsonreuters.com/cgi-bin/jrnlst/jlresults.cgi';

# Values for the PC query parameter; command-line arguments replace the
# defaults entirely.
my @selection = qw( H Y B K D SS );
@selection = @ARGV if @ARGV;

foreach my $pc ( @selection ) {
    my $page = 1;
    while (1) {
        warn "# $pc $page\n";

        my $url = $base_url . '?PC=' . $pc . '&mode=print&Page=' . $page;
        my $tx  = $ua->get($url);

        # Without this check a connection failure or HTTP error looks just
        # like "no more pages" and the output is silently truncated.
        if ( my $err = $tx->error ) {
            # NOTE(review): current Mojolicious returns a hash reference
            # here; the plain-string branch is for older releases - confirm
            # against the installed version.
            my $message = ref $err ? $err->{message} : $err;
            warn "# $pc $page failed: $message\n";
            last;
        }

        my $body = $tx->res->body;

        # A page containing no <DT> is treated as the end of this PC value.
        last if $body !~ m/<DT>/;
        $page++;

        # NOTE(review): split also yields whatever precedes the first <DT>
        # on the page, so that chunk is cleaned and printed like any other
        # record - confirm whether downstream consumers rely on it.
        foreach my $chunk ( split( /<DT>/, $body ) ) {
            my $record = _clean_record($chunk);
            print "$pc|$record\n";
        }
    }
}

# Turn one raw HTML chunk (the text between two <DT> markers) into a single
# pipe-delimited line and return it. The argument is not modified.
sub _clean_record {
    my ($record) = @_;

    # Remove everything from the first <ol> through the last </ol>.
    $record =~ s{<ol>.*</ol>}{}s;

    # Every remaining tag becomes a field separator.
    $record =~ s{<[^>]+>}{|}g;

    # Join the record onto one line.
    $record =~ s{[\n\r]+}{}g;

    # Leading "|12. Title" becomes "12|Title".
    $record =~ s{^\|(\d+)\.\s+}{$1|};

    # Drop whitespace that follows a separator, then collapse separator runs.
    $record =~ s{\|\s+}{|}g;
    $record =~ s{\|+}{|}g;

    # Make the first "ISSN:" label a field boundary.
    $record =~ s{\s+ISSN:\s+}{|};

    # No trailing separator.
    $record =~ s{\|$}{};

    return $record;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment