Skip to content

Instantly share code, notes, and snippets.

@kindy61
Created October 6, 2009 14:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kindy61/203073 to your computer and use it in GitHub Desktop.
Save kindy61/203073 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
use strict;
use warnings;
use LWP;
use LWP::UserAgent ;
use LWP::Simple;
use HTML::Tree;
use URI::Escape;
use URI::QueryParam;
use URI;
# --- Script configuration and entry point -----------------------------------

# Date to search for; the value is passed verbatim into the search form
# (it looks like YYYYMMDD -- confirm against the remote form).
my $searchDate = '20080101';

# Local directory under which all downloaded results are stored.
my $root = 'sipo';

# Base URL that is prepended to the links found in the result page.
my $remoteserv ="http://search.sipo.gov.cn/sipo/zljs/";

# Create the output root once; an already existing directory is fine, but a
# directory that cannot be created is fatal because nothing could be saved.
if (!-d $root) {
    mkdir($root) or die "Can't create directory $root: $!";
}

search_by_date($searchDate);
# search_by_date($date)
#
# Submits the search form for $date, saves the result page as
# "$root/$date/search_result.html", then downloads every result link that
# carries a "recid" and hands each saved page to getif().
#
# Parameters:
#   $date - the date string to search for; it is also used as the name of
#           the sub-directory below $root.
#
# Dies when the search request fails or the result page cannot be written
# or parsed. A failed download of an individual result page only produces a
# warning, and processing continues with the next link.
sub search_by_date
{
    my $file = shift;

    # Percent-encoded search expression of the form <field>=(<date>):
    # %3D is "=", %28 and %29 are the parentheses. The leading field-name
    # bytes are presumably GBK-encoded text -- TODO confirm.
    my $searchword = "\%C9\%EA\%C7\%EB\%C8\%D5\%3D\%28$file%29";
    my $locdir     = "$root\/$file\/";
    my $out_file   = "$root\/$file\/search_result.html"; # where to save it

    my $browser  = LWP::UserAgent->new;
    my $response = $browser->post(
        'http://search.sipo.gov.cn/sipo/zljs/hyjs-jieguo.jsp',
        # That's the URL that the real form submits to.
        [
            "recshu"       => "2",
            "searchword"   => uri_unescape($searchword),
            "flag3"        => "1",
            "pg"           => "1",
            "sign"         => "0",
            "textfield3"   => "",
            "textfield9"   => "",
            "textfield10"  => "",
            "textfield12"  => $file,
            "textfield4"   => "",
            "textfield5"   => "",
            "textfield2"   => "",
            "textfield11"  => "",
            "textfield6"   => "",
            "textfield7"   => "",
            "textfield8"   => "",
            "gjgb"         => "",
            "textfield13"  => "",
            "textfield141" => "",
            "textfield142" => "",
            "textfield143" => "",
        ]
    );
    die "Error: ", $response->status_line, "\n"
        unless $response->is_success;

    # An existing directory (e.g. from an earlier run) is not an error.
    if (!-d $locdir) {
        mkdir($locdir) or die "Can't create directory $locdir: $!";
    }

    # Store the raw response bytes untouched.
    open(my $out, '>', $out_file) or die "Can't write-open $out_file: $!";
    binmode($out);
    print {$out} $response->content or die "Can't write $out_file: $!";
    close($out) or die "Can't close $out_file: $!";

    my $tree = HTML::TreeBuilder->new;
    defined $tree->parse_file($out_file)
        or die "Can't parse $out_file: $!";

    # Result links are <a class="a01"> elements whose href mentions "recid".
    # Anchors without an href are skipped instead of being regex-matched
    # as undef.
    my @links = $tree->look_down(
        _tag  => 'a',
        class => 'a01',
        sub {
            my $href = $_[0]->attr('href');
            return defined $href && $href =~ /recid/;
        },
    );

    my $prevurl = "";
    foreach my $link (@links) {
        my $linkurl = $remoteserv . $link->attr('href');

        # Only *consecutive* duplicates are skipped.
        next if $linkurl eq $prevurl;
        $prevurl = $linkurl;

        my $recid = URI->new($linkurl)->query_param('recid');
        if (!defined $recid) {
            warn "No recid parameter in $linkurl, skipping\n";
            next;
        }

        # $locdir already ends in "/"; the extra slash is kept so the
        # printed path stays the same as before.
        my $localfile = "$locdir/" . $recid . ".html";

        print $linkurl ,"\n";
        print $localfile ,"\n";

        # getstore() returns the HTTP status code; only parse pages that
        # were actually downloaded.
        my $status = getstore($linkurl, $localfile);
        if (!is_success($status)) {
            warn "Failed to fetch $linkurl: HTTP status $status\n";
            next;
        }
        getif($localfile, $locdir);
    }

    $tree->delete; # clear memory!
    return;
}
# getif($file, $locdir)
#
# Parses the saved page $file, reads the hidden form fields "recid",
# "tifpath" and "totalpage", and downloads pages 1..totalpage as
# "%06d.tif" files into "$locdir$recid/". The remote URL of each page is
# the tifpath URL with its last path segment replaced by the page file name.
#
# Parameters:
#   $file   - path of the locally saved HTML page.
#   $locdir - local directory (with trailing "/") that receives the
#             per-record sub-directory.
#
# Problems (unparsable page, missing fields, failed downloads) are reported
# with warn(); the sub never dies, so the caller can continue with the next
# record.
sub getif
{
    my ($file, $locdir) = @_;

    my $tree = HTML::TreeBuilder->new;
    if (!defined $tree->parse_file($file)) {
        warn "Can't parse $file: $!\n";
        $tree->delete;
        return;
    }

    # Collect all named hidden inputs; a later duplicate overrides an
    # earlier one. Inputs without a name attribute are ignored.
    my %field;
    foreach my $input ($tree->look_down(_tag => 'input', type => 'hidden')) {
        my $name = $input->attr('name');
        next unless defined $name;
        $field{$name} = $input->attr('value');
    }
    $tree->delete; # clear memory!

    my ($recid, $tifpath, $totalpage) = @field{qw(recid tifpath totalpage)};
    unless (defined $recid && defined $tifpath && defined $totalpage) {
        warn "Missing recid/tifpath/totalpage field in $file, skipping\n";
        return;
    }
    $totalpage = 0 + $totalpage;

    my $uri  = URI->new($tifpath);
    my @path = $uri->path_segments;
    # There will always be an empty first component.
    shift(@path);
    # Drop the file name so only the directory part remains.
    pop(@path);
    my $tifdir = join('', map { "$_/" } @path);

    # One directory per record; reuse it if it is already there.
    my $pagedir = "$locdir$recid";
    if (!-d $pagedir && !mkdir($pagedir)) {
        warn "Can't create directory $pagedir: $!\n";
        return;
    }

    for my $page (1 .. $totalpage) {
        my $savefile  = sprintf("%06d.tif", $page);
        my $localfile = "$pagedir\/$savefile";

        # Point the URL at this page's image, keeping scheme/host/query.
        $uri->path($tifdir . $savefile);

        my $status = getstore($uri->as_string, $localfile);
        warn "Failed to fetch ", $uri->as_string, ": HTTP status $status\n"
            unless is_success($status);
    }
    return;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment