Created
October 6, 2009 14:36
-
-
Save kindy61/203073 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
# Entry point: runs one search for a single date and mirrors the result page,
# the linked record pages and their TIFF page images below the $root directory.
use strict;
use warnings;
use LWP;
use LWP::UserAgent;
use LWP::Simple;
use HTML::Tree;
use URI::Escape;
use URI::QueryParam;
use URI;

# Date to search for, in the same 8-digit form as the default. An optional
# first command-line argument overrides the default; it is validated because
# the value is later used as a directory name.
my $searchDate = @ARGV ? $ARGV[0] : '20080101';
die "Usage: $0 [YYYYMMDD]\n" unless $searchDate =~ /\A\d{8}\z/;

# Local directory that receives everything that is downloaded.
my $root = 'sipo';

# Prefix that search_by_date() prepends to the hrefs found on the result page.
my $remoteserv = "http://search.sipo.gov.cn/sipo/zljs/";

# An already existing directory is fine; any other mkdir failure is fatal.
-d $root or mkdir($root) or die "Can't create directory $root: $!";

search_by_date($searchDate);
# search_by_date($date)
#
# Posts the search form to hyjs-jieguo.jsp for $date and saves the returned
# page as $root/$date/search_result.html. Then, for every <a class="a01"> link
# on that page whose href contains "recid", downloads the linked page to
# $root/$date/<recid>.html and passes it to getif().
#
# Dies when the search request fails or the result page cannot be written or
# parsed. A record page that cannot be downloaded only produces a warning and
# is skipped.
sub search_by_date
{
    my $file = shift;

    # URL-escaped bytes followed by "=(<date>)"; unescaped again before posting.
    # NOTE(review): the escaped bytes look like a GBK-encoded field name - confirm.
    my $searchword = '%C9%EA%C7%EB%C8%D5%3D%28' . $file . '%29';

    # Keep the trailing slash: getif() appends the record id directly to it.
    my $locdir   = "$root/$file/";
    my $out_file = "${locdir}search_result.html";    # where to save it

    my $browser  = LWP::UserAgent->new;
    my $response = $browser->post(
        'http://search.sipo.gov.cn/sipo/zljs/hyjs-jieguo.jsp',
        # That's the URL that the real form submits to.
        [
            "recshu"       => "2",
            "searchword"   => uri_unescape($searchword),
            "flag3"        => "1",
            "pg"           => "1",
            "sign"         => "0",
            "textfield3"   => "",
            "textfield9"   => "",
            "textfield10"  => "",
            "textfield12"  => $file,
            "textfield4"   => "",
            "textfield5"   => "",
            "textfield2"   => "",
            "textfield11"  => "",
            "textfield6"   => "",
            "textfield7"   => "",
            "textfield8"   => "",
            "gjgb"         => "",
            "textfield13"  => "",
            "textfield141" => "",
            "textfield142" => "",
            "textfield143" => "",
        ]
    );
    die "Error: ", $response->status_line, "\n"
        unless $response->is_success;

    # An already existing directory is fine; any other mkdir failure is fatal.
    -d $locdir or mkdir($locdir) or die "Can't create directory $locdir: $!";

    # Three-argument open with a lexical handle; the body is written as raw bytes.
    open(my $out, '>', $out_file) or die "Can't write-open $out_file: $!";
    binmode($out);
    print {$out} $response->content;
    close($out) or die "Can't close $out_file: $!";

    my $tree = HTML::TreeBuilder->new;
    defined $tree->parse_file($out_file)
        or die "Can't parse $out_file: $!";

    my @links = $tree->look_down(
        _tag  => 'a',
        class => 'a01',
        sub {
            # Anchors without an href must not be matched against undef.
            my $href = $_[0]->attr('href');
            defined $href && $href =~ /recid/;
        },
    );

    my $prevurl = "";
    foreach my $link (@links) {
        my $linkurl = $remoteserv . $link->attr('href');

        # Skip a link that repeats the one immediately before it.
        next if $linkurl eq $prevurl;
        $prevurl = $linkurl;

        my $recid = URI->new($linkurl)->query_param('recid');
        unless (defined $recid && length $recid) {
            warn "No recid parameter in $linkurl, skipping\n";
            next;
        }
        # The id comes from the remote page and becomes a local file name.
        if ($recid =~ m{[/\\]}) {
            warn "Unsafe recid '$recid' in $linkurl, skipping\n";
            next;
        }

        my $localfile = "$locdir$recid.html";
        print $linkurl,   "\n";
        print $localfile, "\n";

        # getstore() returns the HTTP status code of the download.
        my $status = getstore($linkurl, $localfile);
        unless (is_success($status)) {
            warn "Can't fetch $linkurl: HTTP status $status\n";
            next;
        }
        getif($localfile, $locdir);
    }
    $tree->delete;    # clear memory!
}
# getif($file, $locdir)
#
# Parses the saved record page $file and reads its hidden form fields "recid",
# "tifpath" and "totalpage". It then downloads 000001.tif .. <totalpage>.tif
# from the remote directory that contains tifpath and stores them in
# $locdir/<recid>/. $locdir may be given with or without a trailing slash.
#
# Returns nothing. A page that cannot be parsed, a missing or unsafe field, a
# directory that cannot be created and a failed image download are reported
# with warn; none of them is fatal.
sub getif
{
    my ($file, $locdir) = @_;

    my $tree = HTML::TreeBuilder->new;
    unless (defined $tree->parse_file($file)) {
        warn "Can't parse $file: $!\n";
        $tree->delete;
        return;
    }

    # Collect the hidden inputs by name; a later duplicate overrides an earlier one.
    my %hidden;
    foreach my $input ($tree->look_down(_tag => 'input', type => 'hidden')) {
        my $name = $input->attr('name');
        $hidden{$name} = $input->attr('value') if defined $name;
    }
    $tree->delete;    # clear memory!

    my ($recid, $tifpath, $totalpage) = @hidden{qw(recid tifpath totalpage)};
    unless (defined $recid && length $recid
        && defined $tifpath && length $tifpath)
    {
        warn "$file: hidden field recid or tifpath is missing, skipping\n";
        return;
    }
    # The id comes from the remote page and becomes a local directory name.
    if ($recid =~ m{[/\\]} || $recid =~ /\A\.\.?\z/) {
        warn "$file: unsafe recid '$recid', skipping\n";
        return;
    }
    # Use the leading digits as the page count; anything else means no pages.
    $totalpage = (defined $totalpage && $totalpage =~ /\A\s*(\d+)/) ? $1 + 0 : 0;

    # Remote directory of the images: the path of tifpath without its last segment.
    my $uri  = URI->new($tifpath);
    my @path = $uri->path_segments;
    # There will always be an empty first component.
    shift(@path);
    pop(@path);
    my $tifdir = join '', map { "$_/" } @path;

    # Build the local directory once so mkdir and the file names always agree.
    (my $base = $locdir) =~ s{/+\z}{};
    my $pagedir = "$base/$recid";
    unless (-d $pagedir or mkdir($pagedir)) {
        warn "Can't create directory $pagedir: $!\n";
        return;
    }

    foreach my $page (1 .. $totalpage) {
        my $savefile = sprintf("%06d.tif", $page);
        $uri->path($tifdir . $savefile);

        # getstore() returns the HTTP status code of the download.
        my $status = getstore($uri->as_string, "$pagedir/$savefile");
        warn "Can't fetch ", $uri->as_string, ": HTTP status $status\n"
            unless is_success($status);
    }
    return;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment