Created
April 5, 2012 21:52
-
-
Save cherenkov/2314469 to your computer and use it in GitHub Desktop.
スクレイピングでのXPathの使い方について質問です。 Web::Scrap.. - 人力検索はてな http://q.hatena.ne.jp/1333603060
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
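# Question about how to use XPath when scraping with Web::Scraper (Hatena Question):
# スクレイピングでのXPathの使い方について質問です。 Web::Scrap.. - 人力検索はてな
# http://q.hatena.ne.jp/1333603060
# Example Perl program using Web::Scraper (Yoshiiz's notebook blog):
# よしいずの雑記帳 Web::Scraperを使ったPerlプログラムの例
# http://yoshiiz.blog129.fc2.com/blog-entry-382.html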
use strict;
use warnings;
use Web::Scraper;
use Data::Dumper;

# Sample document: two near-identical tables. In the second <td> of each
# table almost all text sits inside child elements (<font>, <b>, <div>);
# only the "ここの部分N" line is a text node directly under the <td>.
#
# The heredoc is single-quoted on purpose: the HTML is literal data, so
# nothing in it (e.g. a future '$' or '@') must ever be interpolated.
my $html_content = <<'EOT';
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
</head>
<body>
<table border="1" cellpadding="0" cellspacing="0" style="display:inline;">
<tbody>
<tr>
<td>
<img height="290" src="" width="200" />
</td>
<td bgcolor="#B7863E" width="250">
<font size="1">
<font color="000000">テスト</font>
</font>
<br />
<big>
<b>
<font color="FFFFFF">テスト</font>
</b>
</big>
<font color="FFFFFF">
<br />
<b>テスト</b>
</font>
<br />
<img src="" />
<img src="" />
<b>4</b>
<img src="" />
<br />
<b>テスト</b>
<br />
ここの部分1
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<hr />
<div align="RIGHT">
<a href="" target="_blank">
<img border="0" height="1" src="" width="1" />
<img src="" />
<font size="1"></font>
</a>
<b>1200</b>
</div>
</td>
</tr>
</tbody>
</table>
<table border="1" cellpadding="0" cellspacing="0" style="display:inline;">
<tbody>
<tr>
<td>
<img height="290" src="" width="200" />
</td>
<td bgcolor="#B7863E" width="250">
<font size="1">
<font color="000000">テスト</font>
</font>
<br />
<big>
<b>
<font color="FFFFFF">テスト</font>
</b>
</big>
<font color="FFFFFF">
<br />
<b>テスト</b>
</font>
<br />
<img src="" />
<img src="" />
<b>6</b>
<img src="" />
<br />
<b>テスト</b>
<br />
ここの部分2
<br />
<br />
<br />
<br />
<br />
<br />
<br />
<hr />
<div align="RIGHT">
<a href="" target="_blank">
<img border="0" height="1" src="" width="1" />
<img src="" />
<font size="1"></font>
</a>
<b>1900</b>
</div>
</td>
</tr>
</tbody>
</table>
</body>
</html>
EOT

# XPath breakdown:
#   //td/text()                  text nodes that are direct children of a <td>
#   [not(normalize-space(.)="")] drop nodes that contain only whitespace
# 'text1[]' collects every match into an array ref stored under the key
# "text1"; 'TEXT' takes the textual content of each matched node.
my $res = scraper {
    process '//td/text()[not(normalize-space(.)="")]', 'text1[]' => 'TEXT';
}->scrape($html_content);

# Dump the scraped structure to STDOUT for inspection.
print Dumper($res);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$VAR1 = {
'text1' => [
' ここの部分1 ',
' ここの部分2 '
]
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment