Created
October 26, 2009 10:32
-
-
Save yuroyoro/218551 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.awt.image.BufferedImage
import java.io._
import java.net.URL
import javax.imageio.ImageIO

import scala.annotation.tailrec
import scala.io.Source
import scala.xml._
/**
 * Command-line crawler that downloads the photos of a Tumblr blog into the
 * current working directory.
 *
 * It pages through `http://<blog>.tumblr.com/api/read` (photo posts only),
 * picks the widest `photo-url` of every post and writes that image to a file
 * named after the last segment of the image URL.
 *
 * Usage: `TumblrCrawler <tumblr-blog-name>`
 */
object TumblrCrawler {

  /** Number of posts requested per API call. */
  private val PageSize = 50

  /** Pause after each image, in milliseconds, so the server is not hammered. */
  private val DownloadDelayMillis = 1000L

  /** Image format (and file suffix) used when the file name has no usable extension. */
  private val DefaultFormat = "png"

  /**
   * Entry point.
   *
   * @param args the first element is the Tumblr blog name (the sub-domain);
   *             when it is missing a usage line is printed to stderr and
   *             nothing is downloaded.
   */
  def main(args: Array[String]): Unit =
    args.headOption match {
      case Some(blog) => crawlingTumblrImages("http://%s.tumblr.com/api/read".format(blog), 0)
      case None       => Console.err.println("Usage: TumblrCrawler <tumblr-blog-name>")
    }

  /**
   * Fetches one page of photo posts starting at offset `cnt`, saves their
   * images, then continues with the next page. Stops at the first page that
   * contains no posts.
   *
   * @param apiUrl base URL of the blog's read API
   * @param cnt    offset of the first post to request
   */
  @tailrec
  private def crawlingTumblrImages(apiUrl: String, cnt: Int): Unit = {
    val pageUrl = apiUrl + "?type=photo&start=%d&num=%d".format(cnt, PageSize)
    println(pageUrl)
    // Let the XML parser read the stream directly: it honours the encoding
    // declared by the document and leaves no Source object open.
    val posts = XML.load(new URL(pageUrl)) \\ "post"
    if (posts.nonEmpty) {
      // Write every image of this page to disk, then recurse into the next page.
      posts.foreach(post => highestResolutionUrl(post).foreach(saveImage))
      crawlingTumblrImages(apiUrl, cnt + PageSize)
    }
  }

  /**
   * Picks the `photo-url` child with the largest `max-width` attribute.
   * On equal widths the later element wins.
   *
   * @param post a `post` element of the API response
   * @return the URL text, or `None` when the post has no (non-blank) `photo-url`
   */
  private def highestResolutionUrl(post: Node): Option[String] = {
    val candidates = post \ "photo-url"
    if (candidates.isEmpty) None
    else {
      val widest = candidates.reduceLeft { (best: Node, candidate: Node) =>
        if (maxWidth(best) > maxWidth(candidate)) best else candidate
      }
      Some(widest.text.trim).filter(_.nonEmpty)
    }
  }

  /** Reads the `max-width` attribute; a missing or non-numeric value ranks lowest (-1). */
  private def maxWidth(photoUrl: Node): Int =
    try (photoUrl \ "@max-width").text.trim.toInt
    catch { case _: NumberFormatException => -1 }

  /**
   * Downloads one image and writes it to the current directory.
   *
   * Problems with a single image (unreachable URL, undecodable data, no
   * encoder for the target format) are reported on stderr and skipped so that
   * the remaining images are still processed.
   *
   * @param url absolute URL of the image
   */
  private def saveImage(url: String): Unit =
    try {
      val location = new URL(url)
      fileNameOf(location) match {
        case None =>
          Console.err.println("Skipped (no file name in URL):" + url)
        case Some(fname) =>
          val (ext, file) = targetFor(fname)
          // ImageIO.read yields null when no registered reader understands the data.
          val decoded = Option(ImageIO.read(location))
          // ImageIO.write yields false when no writer exists for the requested format.
          val written = decoded.exists(img => ImageIO.write(img, ext, file))
          Thread.sleep(DownloadDelayMillis)
          if (written) println("Download:" + fname)
          else Console.err.println("Skipped (cannot decode or encode image):" + url)
      }
    } catch {
      case e: IOException =>
        Console.err.println("Failed:" + url + " (" + e.getMessage + ")")
    }

  /** Last segment of the URL path, or `None` when the path ends with '/' or is empty. */
  private def fileNameOf(location: URL): Option[String] = {
    val path = location.getPath
    Some(path.substring(path.lastIndexOf('/') + 1)).filter(_.nonEmpty)
  }

  /**
   * Derives the image format and the target file from a file name.
   * The text after the last '.' is the format, provided there is at least one
   * character on each side of that dot; otherwise the default format is used
   * and appended to the name as suffix.
   */
  private def targetFor(fname: String): (String, File) = {
    val dot = fname.lastIndexOf('.')
    if (dot > 0 && dot < fname.length - 1) (fname.substring(dot + 1), new File(fname))
    else (DefaultFormat, new File(fname + "." + DefaultFormat))
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment