Skip to content

Instantly share code, notes, and snippets.

@woodywang
Created February 18, 2016 01:43
Show Gist options
  • Save woodywang/60b480496cbf3bc22852 to your computer and use it in GitHub Desktop.
Save woodywang/60b480496cbf3bc22852 to your computer and use it in GitHub Desktop.
package com.woodywang.crawler.task;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.net.URL;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Queue;
import java.util.Set;
/**
 * Breadth-first crawler over the {@code pan.baidu.com/pcloud} JSON endpoints.
 *
 * <p>Starting from {@link #START_POINT_UID}, {@link #start()} repeatedly takes a UID off a queue,
 * fetches that user's follow list, enqueues every UID not seen before, and then lists the user's
 * shares. Progress is printed to standard output.
 *
 * <p>The crawl queue is instance state and nothing in this class is synchronized.
 *
 * <p>Created by woody on 2/17/16.
 */
public class BaiduCrawler {

    /** UID the crawl starts from. */
    public static final long START_POINT_UID = 588345065;

    /** Page size requested from the follow/fans list endpoints. */
    private static final int FRIEND_PAGE_SIZE = 24;

    /** Page size requested from the share list endpoint. */
    private static final int SHARE_PAGE_SIZE = 60;

    private final ObjectMapper objectMapper = new ObjectMapper();

    /** UIDs discovered but not yet crawled. */
    private final Queue<Long> uidQueue = new ArrayDeque<Long>();

    public static void main(String[] args) throws Exception {
        BaiduCrawler self = new BaiduCrawler();
        self.start();
    }

    /**
     * Runs the crawl until the queue is empty.
     *
     * <p>Each UID is enqueued at most once per call: without that guard, two users who follow
     * each other would be re-crawled forever and the queue would grow without bound.
     *
     * @throws Exception if a request, JSON parse, or the inter-page sleep fails
     */
    public void start() throws Exception {
        // Seed with whatever is already queued so leftovers from an aborted run are not re-added.
        Set<Long> seenUids = new HashSet<Long>(uidQueue);
        if (seenUids.add(START_POINT_UID)) {
            uidQueue.add(START_POINT_UID);
        }
        while (!uidQueue.isEmpty()) {
            long uid = uidQueue.remove();
            List<Long> follows = getFollows(uid);
            System.out.println(follows);
            for (Long followUid : follows) {
                if (seenUids.add(followUid)) {
                    uidQueue.add(followUid);
                }
            }
            getResources(uid);
        }
    }

    /**
     * Collects the {@code follow_uk} values from every page of the {@code getfollowlist}
     * endpoint for the given user.
     *
     * @param uid value sent as {@code query_uk}
     * @return the UIDs found, in response order; empty if the response has no {@code follow_list}
     * @throws Exception if a request or JSON parse fails
     */
    public List<Long> getFollows(long uid) throws Exception {
        return fetchUidList(
                "http://pan.baidu.com/pcloud/friend/getfollowlist?query_uk=%d&limit=%d&start=%d",
                "FollowList: ", "follow_list", "follow_uk", uid);
    }

    /**
     * Collects the {@code fans_uk} values from every page of the {@code getfanslist} endpoint
     * for the given user.
     *
     * @param uid value sent as {@code query_uk}
     * @return the UIDs found, in response order; empty if the response has no {@code fans_list}
     * @throws Exception if a request or JSON parse fails
     */
    public List<Long> getFans(long uid) throws Exception {
        return fetchUidList(
                "http://pan.baidu.com/pcloud/friend/getfanslist?query_uk=%d&limit=%d&start=%d",
                "FansList: ", "fans_list", "fans_uk", uid);
    }

    /**
     * Lists the shares of the given user via the {@code getsharelist} endpoint, printing one
     * line per share and sleeping 3-10 seconds between pages.
     *
     * <p>Records without a {@code shareid} are skipped. The file name of a share is taken from
     * the first entry of its {@code filelist}; it is left unset when that list or its
     * {@code server_filename} is absent.
     *
     * @param uid value sent as {@code query_uk}
     * @return the shares found, in response order
     * @throws Exception if a request, JSON parse, or the sleep fails
     */
    public List<Resource> getResources(long uid) throws Exception {
        List<Resource> resources = new ArrayList<Resource>();
        int offset = 0;
        int count;
        do {
            count = 0;
            String url = String.format(
                    "http://pan.baidu.com/pcloud/feed/getsharelist?auth_type=1&start=%d&limit=%d&query_uk=%d",
                    offset, SHARE_PAGE_SIZE, uid);
            System.out.println(url);
            JsonNode json = objectMapper.readTree(new URL(url));
            JsonNode records = (json == null) ? null : json.get("records");
            if (records == null) {
                break;
            }
            for (JsonNode info : records) {
                // Count every record, even skipped ones: the count decides whether the page was full.
                count++;
                JsonNode shareId = info.get("shareid");
                if (shareId == null) {
                    continue;
                }
                Resource resource = new Resource();
                // e.g. http://pan.baidu.com/share/link?uk=588345065&shareid=4066092938
                resource.setUrl("http://pan.baidu.com/share/link?uk=" + uid + "&shareid=" + shareId.asText());
                JsonNode fileList = info.get("filelist");
                if (fileList != null) {
                    for (JsonNode file : fileList) {
                        JsonNode fileName = file.get("server_filename");
                        if (fileName != null) {
                            resource.setFileName(fileName.asText());
                        }
                        break; // only the first file is used to name the share
                    }
                }
                resources.add(resource);
                System.out.println(resource.getUrl() + " -- " + resource.getFileName());
            }
            offset += SHARE_PAGE_SIZE;
            // Randomized pause between page requests.
            Thread.sleep(3000 + (long) (Math.random() * 7000));
        } while (count >= SHARE_PAGE_SIZE);
        return resources;
    }

    /**
     * Pages through one of the friend-list endpoints and collects a numeric field from each entry.
     *
     * @param urlTemplate format string taking uid, limit and start, in that order
     * @param logLabel prefix printed before each requested URL
     * @param listField name of the array field holding the entries
     * @param uidField name of the field read from each entry
     * @param uid value sent as {@code query_uk}
     * @return the collected values, in response order
     * @throws Exception if a request or JSON parse fails
     */
    private List<Long> fetchUidList(String urlTemplate, String logLabel, String listField,
            String uidField, long uid) throws Exception {
        List<Long> result = new ArrayList<Long>();
        int offset = 0;
        int count;
        do {
            count = 0;
            String url = String.format(urlTemplate, uid, FRIEND_PAGE_SIZE, offset);
            System.out.println(logLabel + url);
            JsonNode json = objectMapper.readTree(new URL(url));
            JsonNode entries = (json == null) ? null : json.get(listField);
            if (entries == null || !entries.isArray()) {
                break;
            }
            for (JsonNode info : entries) {
                count++;
                JsonNode uk = info.get(uidField);
                if (uk != null) {
                    result.add(uk.asLong());
                }
            }
            // Advance once per page, not once per entry, matching getResources.
            // NOTE(review): assumes "start" is an item offset - confirm against the endpoint.
            offset += FRIEND_PAGE_SIZE;
        } while (count >= FRIEND_PAGE_SIZE);
        return result;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment