Created
February 20, 2016 03:35
-
-
Save lacucaracha-jp/fd0e7d857d6041ed53c4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.math.BigDecimal;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import net.arnx.jsonic.JSON;
import org.apache.xmlrpc.client.XmlRpcClient;
import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;
import org.jsoup.Jsoup;
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
/** | |
* | |
* @author Amano | |
*/ | |
public class HatenaBookmark { | |
static InputStream in = null; | |
static HttpURLConnection hconn = null; | |
static String url = null; | |
static InputStream is = null; | |
static String str = null; | |
static org.jsoup.nodes.Document document; | |
static PreparedStatement pstmt; | |
static java.sql.Connection sconn = null; | |
static ResultSet rs; | |
static SimpleDateFormat bookmarkformat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX", Locale.ENGLISH); | |
public static void main(String[] args) { | |
try { | |
String dbUrl = "jdbc:sqlite:" + args[2]; | |
String driver = "org.sqlite.JDBC"; | |
DriverManager.setLoginTimeout(5000); | |
Class.forName(driver).newInstance(); | |
sconn = DriverManager.getConnection(dbUrl); | |
sconn.setAutoCommit(false); | |
pstmt = sconn.prepareStatement("delete from bookmarks"); | |
pstmt.executeUpdate(); | |
pstmt = sconn.prepareStatement("delete from pages"); | |
pstmt.executeUpdate(); | |
pstmt = sconn.prepareStatement("delete from hatena_id"); | |
pstmt.executeUpdate(); | |
pstmt = sconn.prepareStatement("delete from blog_subscriber_id"); | |
pstmt.executeUpdate(); | |
sconn.commit(); | |
System.out.println("処理開始"); | |
getHatenaIdData(args[0]); | |
for (int i = 0; i < Integer.parseInt(args[1]); i++) { | |
System.out.println(i + ":" + recentUpdate()); | |
getBlogSubscriber(recentUpdate()); | |
} | |
} catch (Exception e) { | |
System.out.println(e); | |
} | |
} | |
static void insertBookmarkData(String burl, int pageCnt) { | |
for (int i = 0; i <= pageCnt - 1; i++) { | |
try { | |
url = "http://b.hatena.ne.jp/entrylist?sort=hot&layout=headline&url=" + burl + "&of=" + i * 20; | |
document = Jsoup.connect(url).timeout(100000).get(); | |
for (org.jsoup.nodes.Element element : document.select("li.entrylist-unit")) { | |
try { | |
url = URLEncoder.encode(element.getElementsByTag("a").attr("href"), "utf-8"); | |
url = "http://b.hatena.ne.jp/entry/jsonlite/?url=" + url; | |
hconn = (HttpURLConnection) new URL(url).openConnection();//サイトに接続 | |
hconn.setRequestMethod("GET");//プロトコルの設定 | |
hconn.setConnectTimeout(100000); | |
in = hconn.getInputStream();//ファイルを開く | |
str = convertString(in);//1行読み取り | |
Page page = JSON.decode(str, Page.class); | |
pstmt = sconn.prepareStatement("select * from PAGES where eid = ?"); | |
pstmt.setString(1, page.getEid()); | |
rs = pstmt.executeQuery(); | |
if (!rs.next()) { | |
for (Bookmark bookmark : page.getBookmarks()) { | |
System.out.println(page.getTitle() + "," + bookmark.getUser() + "," + bookmark.getComment()); | |
pstmt = sconn.prepareStatement("insert into BOOKMARKS(URL,EID,BOOKMARKUSER,STARCOUNT,TIMESTAMP,COMMENT,TAG) values(?,?,?,?,?,?,?) "); | |
pstmt.setString(1, page.getUrl()); | |
pstmt.setString(2, page.getEid()); | |
pstmt.setString(3, bookmark.getUser()); | |
pstmt.setInt(4, 0); | |
pstmt.setDate(5, new java.sql.Date(bookmark.getTimestamp().getTime())); | |
pstmt.setString(6, bookmark.getComment()); | |
pstmt.setString(7, bookmark.getTags()); | |
pstmt.executeUpdate(); | |
} | |
pstmt = sconn.prepareStatement("insert into PAGES(DATE,CATEGORY,EID,ENTRYRANK,COUNT,URL,TITLE) values(?,?,?,?,?,?,?) "); | |
pstmt.setDate(1, null); | |
pstmt.setString(2, element.select("li.category").text()); | |
pstmt.setString(3, page.getEid()); | |
pstmt.setBigDecimal(4, new BigDecimal(0)); | |
pstmt.setBigDecimal(5, new BigDecimal(page.getCount())); | |
pstmt.setString(6, page.getUrl()); | |
pstmt.setString(7, page.getTitle()); | |
pstmt.executeUpdate(); | |
sconn.commit(); | |
} | |
} catch (Exception e) { | |
System.out.println(e.toString()); | |
} | |
} | |
} catch (Exception e) { | |
System.out.println(e.toString()); | |
} | |
} | |
} | |
static void getBlogSubscriber(String blogurl) { | |
try { | |
String furl; | |
url = blogurl + "/about"; | |
document = Jsoup.connect(url).timeout(100000).get(); | |
//読者情報の取得 | |
for (org.jsoup.nodes.Element element : document.select("div.info").select("img.profile-icon")) { | |
pstmt = sconn.prepareStatement("insert into BLOG_SUBSCRIBER_ID(URL,SUBSCRIBER_ID) values(?,?) "); | |
pstmt.setString(1, blogurl); | |
pstmt.setString(2, element.getElementsByTag("img").attr("title")); | |
pstmt.executeUpdate(); | |
getHatenaIdData(element.getElementsByTag("img").attr("title")); | |
try { | |
} catch (Exception e) { | |
System.out.println(e.toString()); | |
} | |
} | |
//読者情報の取得 | |
int subscriber = 0; | |
try { | |
document = Jsoup.connect(url).timeout(100000).get(); | |
subscriber = Integer.parseInt(document.select("span.about-subscription-count").text().replaceAll(" 人", "").replaceAll(" people", "")); | |
} catch (Exception e) { | |
subscriber = 0; | |
} | |
//feedly情報の取得 | |
String feedlystr = ""; | |
int feedlycnt = 0; | |
try { | |
furl = "http://cloud.feedly.com/v3/search/feeds?query=" + URLEncoder.encode(blogurl, "utf-8"); | |
hconn = (HttpURLConnection) new URL(furl).openConnection();//サイトに接続 | |
hconn.setRequestMethod("GET");//プロトコルの設定 | |
hconn.setConnectTimeout(100000); | |
in = hconn.getInputStream();//ファイルを開く | |
feedlystr = convertString(in);//1行読み取り | |
feedlystr = feedlystr.replaceAll(".*\"subscribers\":", "").replaceAll(",\"lastUpdated\".*", ""); | |
feedlycnt = Integer.parseInt(feedlystr); | |
} catch (Exception e) { | |
System.out.println(e.toString()); | |
} | |
insertBookmarkData(blogurl, 5); | |
pstmt = sconn.prepareStatement("update HATENA_ID set BLOG_COUNT=?,FEEDLY_COUNT=? where BLOG_URL=?"); | |
pstmt.setInt(1, subscriber); | |
pstmt.setInt(2, feedlycnt); | |
pstmt.setString(3, blogurl); | |
pstmt.executeUpdate(); | |
sconn.commit(); | |
} catch (Exception e) { | |
System.out.println(e.toString()); | |
} | |
} | |
static void getHatenaIdData(String HatenaId) { | |
java.sql.Time postdate = null; | |
String url = null; | |
String blogurl = null; | |
String blogtitle = null; | |
int hatenacnt = 0; | |
try { | |
pstmt = sconn.prepareStatement("select * from HATENA_ID where ID = ?"); | |
pstmt.setString(1, HatenaId); | |
rs = pstmt.executeQuery(); | |
if (!rs.next()) { | |
try { | |
url = "http://blog.hatena.ne.jp/" + HatenaId; | |
document = Jsoup.connect(url).timeout(100000).get(); | |
blogurl = document.getElementsByTag("HTML").attr("data-blogs-uri-base"); | |
blogtitle = document.getElementsByTag("TITLE").text(); | |
postdate = new java.sql.Time(bookmarkformat.parse(document.getElementsByTag("TIME").first().attr("datetime")).getTime()); | |
hatenacnt = getCount(blogurl); | |
} catch (Exception e) { | |
} | |
pstmt = sconn.prepareStatement("insert into HATENA_ID(ID,BLOG_URL,LAST_POST,BLOG_TITLE,BLOG_COUNT,FEEDLY_COUNT,HATENA_COUNT,LAST_UPDATE)" | |
+ " values(?,?,?,?,?,?,?,?) "); | |
pstmt.setString(1, HatenaId); //1.ID | |
pstmt.setString(2, blogurl); //2.BLOG_URL | |
pstmt.setTime(3, postdate); //3.LAST_POST | |
pstmt.setString(4, blogtitle); //4.BLOG_TITLE | |
pstmt.setInt(5, 0); //5.BLOG_COUNT | |
pstmt.setInt(6, 0); //6.FEEDLY_COUNT | |
pstmt.setInt(7, hatenacnt); //7.HATENA_COUNT | |
pstmt.setTime(8, null); //8.LAST_UPDATE | |
pstmt.executeUpdate(); | |
sconn.commit(); | |
System.out.println(blogtitle + ":" + blogurl); | |
} | |
} catch (Exception e) { | |
System.out.println(e.toString()); | |
} | |
} | |
static String convertString(InputStream is) throws IOException { | |
BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8")); | |
StringBuilder sb = new StringBuilder(); | |
char[] b = new char[1024]; | |
int line; | |
while (0 <= (line = reader.read(b))) { | |
sb.append(b, 0, line); | |
} | |
return sb.toString(); | |
} | |
static int getCount(String burl) { | |
String API_URL = "http://b.hatena.ne.jp/xmlrpc"; | |
String API_METHOD_NAME = "bookmark.getTotalCount"; | |
String URLS[] = {burl}; | |
XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl(); | |
try { | |
config.setServerURL(new URL(API_URL)); | |
} catch (MalformedURLException e) { | |
e.printStackTrace(); | |
return 0; | |
} | |
XmlRpcClient client = new XmlRpcClient(); | |
client.setConfig(config); | |
try { | |
Object result = client.execute(API_METHOD_NAME, URLS); | |
return Integer.parseInt(result.toString()); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
return 0; | |
} | |
static String recentUpdate() { | |
String BLOG_URL = ""; | |
try { | |
pstmt = sconn.prepareStatement("select t1.BLOG_URL from HATENA_ID t1 where t1.BLOG_COUNT = 0 and t1.LAST_POST is not null order by HATENA_COUNT desc"); | |
rs = pstmt.executeQuery(); | |
rs.next(); | |
BLOG_URL = rs.getString("BLOG_URL"); | |
} catch (Exception e) { | |
System.out.println(e.toString()); | |
} | |
return BLOG_URL; | |
} | |
} | |
/**
 * Plain bean holding one user's bookmark on a page: who bookmarked it, when,
 * with which comment and which tags.
 *
 * <p>All properties are {@code null} until set. The tag text is exposed as the
 * bean property {@code tags} ({@link #getTags()} / {@link #setTags(String)}).
 */
class Bookmark {

    private Date timestamp;
    private String comment;
    private String user;
    private String tag;

    /** @return the time of the bookmark, or {@code null} if unset */
    public Date getTimestamp() {
        return timestamp;
    }

    /** @param timestamp the time of the bookmark */
    public void setTimestamp(Date timestamp) {
        this.timestamp = timestamp;
    }

    /** @return the bookmark comment, or {@code null} if unset */
    public String getComment() {
        return comment;
    }

    /** @param comment the bookmark comment */
    public void setComment(String comment) {
        this.comment = comment;
    }

    /** @return the bookmarking user, or {@code null} if unset */
    public String getUser() {
        return user;
    }

    /** @param user the bookmarking user */
    public void setUser(String user) {
        this.user = user;
    }

    /** @return the tag text, or {@code null} if unset */
    public String getTags() {
        return tag;
    }

    /** @param tag the tag text */
    public void setTags(String tag) {
        this.tag = tag;
    }
}
class Page { | |
public int count; | |
public String url; | |
public String eid; | |
public String title; | |
public String screenshot; | |
public String entry_url; | |
private List<Bookmark> bookmarkList; | |
public void setBookmarks(List<Bookmark> bookmarkList) { | |
this.bookmarkList = bookmarkList; | |
} | |
public List<Bookmark> getBookmarks() { | |
return bookmarkList; | |
} | |
public int getCount() { | |
return count; | |
} | |
public String getUrl() { | |
return url; | |
} | |
public String getEid() { | |
return eid; | |
} | |
public String getTitle() { | |
return title; | |
} | |
public String getScreenshot() { | |
return screenshot; | |
} | |
public String getEntry_url() { | |
return entry_url; | |
} | |
public void setCount(int count) { | |
this.count = count; | |
} | |
public void setUrl(String url) { | |
this.url = url; | |
} | |
public void setEid(String eid) { | |
this.eid = eid; | |
} | |
public void setTitle(String title) { | |
this.title = title; | |
} | |
public void setScreenshot(String screenshot) { | |
this.screenshot = screenshot; | |
} | |
public void setEntry_url(String entry_url) { | |
this.entry_url = entry_url; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment