Created
February 18, 2016 01:43
-
-
Save woodywang/60b480496cbf3bc22852 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.woodywang.crawler.task; | |
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.net.URL;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Queue;
import java.util.Set;
/** | |
* Created by woody on 2/17/16. | |
*/ | |
public class BaiduCrawler { | |
private ObjectMapper objectMapper = new ObjectMapper(); | |
public static final long START_POINT_UID = 588345065; | |
private Queue<Long> uidQueue = new ArrayDeque<Long>(); | |
public static void main(String[] args) throws Exception { | |
BaiduCrawler self = new BaiduCrawler(); | |
self.start(); | |
} | |
public void start() throws Exception { | |
uidQueue.add(START_POINT_UID); | |
while (!uidQueue.isEmpty()) { | |
long uid = uidQueue.remove(); | |
List<Long> follows = getFollows(uid); | |
System.out.println(follows); | |
uidQueue.addAll(follows); | |
getResources(uid); | |
} | |
} | |
public List<Long> getFollows(long uid) throws Exception { | |
int limit = 24; | |
int offset = 0; | |
List<Long> result = new ArrayList<Long>(); | |
int count; | |
do { | |
count = 0; | |
String url = String.format("http://pan.baidu.com/pcloud/friend/getfollowlist?query_uk=%d&limit=%d&start=%d", uid, limit, offset); | |
System.out.println("FollowList: " + url); | |
JsonNode json = objectMapper.readTree(new URL(url)); | |
if (json.get("follow_list") == null || !json.get("follow_list").isArray()) { | |
continue; | |
} | |
for (JsonNode info : json.get("follow_list")) { | |
long followUid = info.get("follow_uk").asLong(); | |
result.add(followUid); | |
count++; | |
offset += limit; | |
} | |
} while (count >= limit); | |
return result; | |
} | |
public List<Long> getFans(long uid) throws Exception { | |
int limit = 24; | |
int offset = 0; | |
List<Long> result = new ArrayList<Long>(); | |
int count; | |
do { | |
count = 0; | |
String url = String.format("http://pan.baidu.com/pcloud/friend/getfanslist?query_uk=%d&limit=%d&start=%d", uid, limit, offset); | |
System.out.println("FollowList: " + url); | |
JsonNode json = objectMapper.readTree(new URL(url)); | |
if (json.get("fans_list") == null || !json.get("fans_list").isArray()) { | |
continue; | |
} | |
for (JsonNode info : json.get("fans_list")) { | |
long followUid = info.get("fans_uk").asLong(); | |
result.add(followUid); | |
count++; | |
offset += limit; | |
} | |
} while (count >= limit); | |
return result; | |
} | |
public List<Resource> getResources(long uid) throws Exception { | |
int limit = 60; | |
List<Resource> resources = new ArrayList<Resource>(); | |
int offset = 0; | |
int count = 0; | |
do { | |
count = 0; | |
String url = String.format("http://pan.baidu.com/pcloud/feed/getsharelist?auth_type=1&start=%d&limit=%d&query_uk=%d", offset, limit, uid); | |
System.out.println(url); | |
// String jsonContent = fetchUrl(url); | |
JsonNode json = objectMapper.readTree(new URL(url)); | |
JsonNode records = json.get("records"); | |
if (records == null) { | |
continue; | |
} | |
for (JsonNode info : records) { | |
count++; | |
Resource resource = new Resource(); | |
if (info.get("shareid") == null) { | |
continue; | |
} | |
String shortUrl = info.get("shareid").asText(); | |
// http://pan.baidu.com/share/link?uk=588345065&shareid=4066092938 | |
// resource.setUrl("http://pan.baidu.com/s/" + shortUrl); | |
resource.setUrl("http://pan.baidu.com/share/link?uk=" + uid + "&shareid=" + shortUrl); | |
JsonNode fileList = info.get("filelist"); | |
for (JsonNode file : fileList) { | |
String fileName = file.get("server_filename").asText(); | |
resource.setFileName(fileName); | |
break; | |
} | |
resources.add(resource); | |
System.out.println(resource.getUrl() + " -- " + resource.getFileName()); | |
} | |
offset += limit; | |
Thread.sleep(3000 + (long) (Math.random() * 7000)); | |
} while (count >= limit); | |
return resources; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment