Skip to content

Instantly share code, notes, and snippets.

@turbotree
Created November 4, 2013 01:53
Show Gist options
  • Save turbotree/7296987 to your computer and use it in GitHub Desktop.
Save turbotree/7296987 to your computer and use it in GitHub Desktop.
利用jsoup获取网页图片链接并下载到本地
import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class DownImages {
private static int COUNT = 0;
private static int DOWN_COUNT = 0;
public static void jsoupHTML(String urlPath) throws Exception{
Document doc = Jsoup.connect(urlPath).timeout(1000000).get();
//:当前页中的图片
Elements srcLinks = doc.select("img[src$=.jpg]");//以.jpg结尾的元素
for (Element link : srcLinks) {
//:剔除标签,只剩链接路径
String imagesPath = link.attr("src");
System.out.println("当前访问路径:"+imagesPath);
getImages(imagesPath, "d:/images/0000"+ ++COUNT +".jpg");
}
//:提取网站中所有的href连接
Elements linehrefs = doc.select("a[href]");
for (Element linehref : linehrefs) {
String lihr = linehref.attr("href");
if(lihr.length()>4){
String ht = lihr.substring(0, 4);
String htt = lihr.substring(0, 1);
if(!ht.equals("http") && htt.equals("/")){
lihr = urlPath + lihr;
}
if(lihr.substring(0, 4).equals("http")){
Document docs = Jsoup.connect(lihr).timeout(1000000).get();
Elements links = docs.select("img[src$=.jpg]");
for (Element link : links) {
//:剔除标签,只剩链接路径
String imagesPath = link.attr("src");
System.out.println("当前访问路径:"+imagesPath);
getImages(imagesPath, "d://images//0000"+ COUNT++ +".jpg");
}
}
}
}
}
/**
* @param urlPath 图片路径
* @throws Exception
*/
public static void getImages(String urlPath,String fileName) throws Exception{
URL url = new URL(urlPath);//:获取的路径
//:http协议连接对象
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestMethod("GET");
conn.setReadTimeout(6 * 10000);
if (conn.getResponseCode() <10000){
InputStream inputStream = conn.getInputStream();
byte[] data = readStream(inputStream);
if(data.length>(1024*10)){
FileOutputStream outputStream = new FileOutputStream(fileName);
outputStream.write(data);
System.err.println("第"+ ++DOWN_COUNT +"图片下载成功");
outputStream.close();
}
}
}
/**
* 读取url中数据,并以字节的形式返回
* @param inputStream
* @return
* @throws Exception
*/
public static byte[] readStream(InputStream inputStream) throws Exception{
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
byte[] buffer = new byte[1024];
int len = -1;
while((len = inputStream.read(buffer)) !=-1){
outputStream.write(buffer, 0, len);
}
outputStream.close();
inputStream.close();
return outputStream.toByteArray();
}
public static void main(String[] args) {
try {
String urlPath = "http://image.baidu.com/";
jsoupHTML(urlPath);
} catch (Exception e) {
e.printStackTrace();
}finally{
System.out.println("共访问"+COUNT+"张图片,其中下载"+DOWN_COUNT+"张图片");
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment