Last active
May 3, 2021 05:00
-
-
Save lacucaracha-jp/36a9198be0ee88d52262dafba2bb2c1d to your computer and use it in GitHub Desktop.
Hatena
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import sqlite3
import sys
import time
import urllib.parse
from contextlib import closing
from datetime import datetime as dt

import requests
from bs4 import BeautifulSoup
# Seconds to wait on each HTTP request; without a timeout `requests` can
# block forever on a stalled connection.
HTTP_TIMEOUT = 30

# How many times the star lookup for one bookmark is tried before giving up.
MAX_STAR_ATTEMPTS = 10


def bookmark_star_url(buser, btimestamp, eid):
    """Return the Hatena Star API URL for one user's bookmark of an entry.

    Args:
        buser: Name of the user who made the bookmark.
        btimestamp: ``datetime`` of the bookmark; only its date is used.
        eid: Entry id as reported by the bookmark API.
    """
    bookmark_url = (
        "https://b.hatena.ne.jp/" + buser + "/"
        + btimestamp.strftime('%Y%m%d') + "#bookmark-" + str(eid)
    )
    return "https://s.hatena.com/entry.json?uri=" + urllib.parse.quote(bookmark_url)


def collect_star_rows(star_entry, buser, eid):
    """Turn one star-API entry into rows for the ``Star`` table.

    Args:
        star_entry: Dict holding a ``"stars"`` list and, optionally, a
            ``"colored_stars"`` list of ``{"color": ..., "stars": [...]}``.
        buser: Name of the user whose bookmark received the stars.
        eid: Entry id the bookmark belongs to.

    Returns:
        A list of ``(star_user, color, buser, eid, quote)`` tuples; ``color``
        is ``None`` for ordinary stars.

    Raises:
        KeyError: If an expected key is missing from ``star_entry``.
    """
    rows = [
        (star["name"], None, buser, eid, star["quote"])
        for star in star_entry["stars"]
    ]
    if "colored_stars" in star_entry:
        for colored in star_entry["colored_stars"]:
            rows.extend(
                (star["name"], colored["color"], buser, eid, star["quote"])
                for star in colored["stars"]
            )
    return rows


def fetch_star_rows(surl, buser, eid):
    """Download the stars of one bookmark, retrying on failure.

    Rows are built completely inside a single attempt, so a failed attempt
    never leaves partial or duplicated rows behind for the next one.

    Returns:
        The ``Star`` rows for the bookmark, or an empty list when the API
        reports no entry or every attempt failed.
    """
    for attempt in range(MAX_STAR_ATTEMPTS):
        try:
            entries = requests.get(surl, timeout=HTTP_TIMEOUT).json()["entries"]
            return collect_star_rows(entries[0], buser, eid) if entries else []
        except Exception:
            # Deliberately broad: the lookup is best-effort and any network,
            # decoding or shape problem should only trigger a retry.
            print(surl)
            print("エラー発生:" + str(attempt))
            # Back off a little longer after each failure (1s, 2s, ...).
            time.sleep(attempt + 1)
    return []


def store_bookmarks(cursor, page, purl):
    """Insert every bookmark of ``page`` and the stars it received.

    Args:
        cursor: Open sqlite3 cursor; the caller is responsible for committing.
        page: Decoded bookmark-API response for the entry.
        purl: URL of the bookmarked page.
    """
    eid = page["eid"]
    for bookmark in page.get("bookmarks", []):
        buser = bookmark["user"]
        bcomment = bookmark["comment"]
        btimestamp = dt.strptime(bookmark["timestamp"], '%Y/%m/%d %H:%M')
        # Stars are only looked up for bookmarks that carry a comment.
        star_rows = []
        if bcomment:
            star_rows = fetch_star_rows(
                bookmark_star_url(buser, btimestamp, eid), buser, eid
            )
        cursor.executemany(
            'Insert into Star(STARUSER,COLOR,BOOKMARKUSER,EID,QUOTE) values(?,?,?,?,?)',
            star_rows,
        )
        cursor.execute(
            'Insert into Bookmark(EID,BOOKMARKUSER,URL,STARCOUNT,TIMESTAMP,COMMENT) values(?,?,?,?,?,?)',
            (eid, buser, purl, len(star_rows), btimestamp, bcomment),
        )


def main(argv=None):
    """Scrape one day's Hatena hot entries into a SQLite database.

    Args:
        argv: ``[date, database_path]`` where ``date`` is ``YYYYMMDD``.
            Defaults to ``sys.argv[1:]``.

    Entries whose EID is already present in ``Page`` are skipped. Each new
    entry is committed on its own, so an interrupted run keeps the entries it
    finished and a rerun continues with the rest.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.exit("usage: " + sys.argv[0] + " YYYYMMDD DATABASE_PATH")
    pdate, db = args[0], args[1]
    # Parse once, before any network traffic, so a malformed date fails fast.
    entry_date = dt.strptime(pdate, '%Y%m%d')

    response = requests.get(
        "https://b.hatena.ne.jp/hotentry/all/" + pdate, timeout=HTTP_TIMEOUT
    )
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    elems = soup.find_all("div", class_="entrylist-contents")
    print("★★★★★" + pdate + "★★★★★")

    # closing() guarantees the connection is released even if an entry fails;
    # work that was not committed is then discarded.
    with closing(sqlite3.connect(db)) as conn:
        cursor = conn.cursor()
        for rank, elem in enumerate(elems, start=1):
            purl = elem.find("a").get("href")
            burl = (
                "https://b.hatena.ne.jp/entry/jsonlite/?url="
                + urllib.parse.quote(purl)
            )
            page = requests.get(burl, timeout=HTTP_TIMEOUT).json()
            if not page:
                # An empty or null JSON body carries nothing to store; skip
                # it rather than abort the whole run.
                continue
            count = page["count"]
            title = page["title"]
            eid = page["eid"]
            category = elem.find(
                "li", class_="entrylist-contents-category"
            ).text.strip()
            # Titles containing characters the console encoding (e.g. EUC)
            # cannot represent make print() fail; fall back to a line
            # without the title.
            try:
                print(str(rank) + ":" + str(count) + ":" + str(title))
            except UnicodeEncodeError:
                print(str(rank) + ":" + str(count))

            stored = cursor.execute(
                'select count(*) "count" from Page where EID = ?', (eid,)
            ).fetchone()[0]
            if stored:
                continue

            # Clear any leftover child rows for this entry before inserting.
            cursor.execute("delete from Bookmark where EID = ?", (eid,))
            cursor.execute("delete from Star where EID = ?", (eid,))
            cursor.execute(
                "insert into Page(EID,Date,category,Entryrank,Count,Title,URL) values(?,?,?,?,?,?,?)",
                (eid, entry_date, category, rank, count, title, purl),
            )
            store_bookmarks(cursor, page, purl)
            conn.commit()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment