Last active
July 1, 2020 21:36
-
-
Save cmpute/4f6ddd0cd249edabd6aa3d5cbcfacfc9 to your computer and use it in GitHub Desktop.
astost_crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Crawler for the ASTOST forum.

Before running the scripts, please set page loading order to descending by post date (instead of reply date)
'''
import browsercookie
import sqlalchemy
import tqdm
from random import uniform
from itertools import count as icount
from time import sleep
from sqlalchemy.orm import sessionmaker
from lxml import html
from models import Base, Forum, Thread, User
import pytz
from datetime import datetime, timezone, timedelta

# Site epoch, used below as the "nothing crawled yet" baseline timestamp.
ASTOST_ZERO = 996700380 # ASTOST's actual birthday
# Timestamps parsed from the site are localized to this zone.
ASTOST_ZONE = pytz.timezone('Asia/Shanghai')

# Suppress InsecureRequestWarning and retry mechanisms
import requests
from requests.packages import urllib3
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# All requests below use verify=False; silence the resulting TLS warnings.
urllib3.disable_warnings()
def requests_retry_session(
    retries=5,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
):
    """Return a requests.Session that automatically retries failed requests.

    Parameters
    ----------
    retries : total/read/connect retry budget passed to urllib3's Retry.
    backoff_factor : exponential backoff factor between retry attempts.
    status_forcelist : HTTP status codes that trigger a retry.
    session : optional existing session to configure; a new one is created
        when omitted.
    """
    sess = session or requests.Session()
    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    http_adapter = HTTPAdapter(max_retries=retry_policy)
    # Install the same retrying adapter for both schemes.
    for scheme in ('http://', 'https://'):
        sess.mount(scheme, http_adapter)
    return sess
# Shared HTTP session with retry behaviour, used by all fetches below.
srequest = requests_retry_session()

# Logging settings
import logging
formatter = logging.Formatter('[%(asctime)s][%(module)s] %(message)s')
handler = logging.FileHandler('astost.log')
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)
# Crawler's own logger: INFO and above, written to astost.log.
logger = logging.getLogger("crawler")
logger.setLevel(logging.INFO)
logger.handlers = [handler]
# SQLAlchemy shares the file handler but only warnings are recorded.
sqllogger = logging.getLogger('sqlalchemy')
sqllogger.handlers = [handler]
sqllogger.setLevel(logging.WARN)

# Setup client data
# Login cookies are taken from the local Firefox profile: the crawler
# assumes you are already logged in to the site in Firefox.
cookies = browsercookie.firefox()
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
}
def get_forum_list(db):
    """Fetch the site's forum index and persist any forums not yet recorded.

    Parameters
    ----------
    db : SQLAlchemy engine used to open a session.

    Returns
    -------
    list of int forum ids found on the index page (including duplicates of
    already-known forums).

    Raises
    ------
    RuntimeError if the page contains a <form>, which indicates the login
    page was served instead of the forum index.
    """
    logger.info("Site: fetching forum list")
    tsession = sessionmaker(bind=db)
    session = tsession()
    try:
        # timeout raised from 1s to 5s for consistency with get_forum_page;
        # 1s caused spurious timeouts on slow connections.
        response = srequest.get("https://www.astost.com/bbs/", cookies=cookies,
                                headers=headers, verify=False, timeout=5)
        page = html.fromstring(response.content)
        if len(page.xpath("//form")) > 0:
            raise RuntimeError("You need to login first!")
        forum_rows = page.xpath("//tr[@class='tr3 f_one']")
        forum_blocks = []
        forum_idlist = []
        for forum in forum_rows:
            # Collect both the main forum links and the 'style1' sub-links.
            forum_blocks.extend(forum.xpath('.//h3/a'))
            forum_blocks.extend(forum.xpath(".//a[@class='style1']"))
        for entry in forum_blocks:
            # Strip decorative CJK brackets around the forum name.
            forum_name = ''.join(entry.itertext()).lstrip('【').rstrip('】')
            forum_link = entry.attrib['href']
            # The forum id is the value of the last query parameter in the href.
            forum_id = int(forum_link[forum_link.rfind('=') + 1:])
            forum_idlist.append(forum_id)
            # Insert only forums we have not recorded yet.
            if not session.query(Forum).filter(Forum.fid == forum_id).count():
                session.add(Forum(fid=forum_id, name=forum_name, latest_thread=None))
        logger.info("Site: total %d forums found", len(forum_idlist))
        logger.info("Site: committing forum list")
        session.commit()
        return forum_idlist
    finally:
        # Previously the session was never closed; release it explicitly.
        session.close()
def update_forum(db, fid, max_interval=3):
    """Crawl forum `fid` page by page until previously-seen threads are reached.

    Parameters
    ----------
    db : SQLAlchemy engine used to open a session.
    fid : forum id; a matching Forum row must already exist.
    max_interval : upper bound (seconds) of the random pause between pages.

    Side effects: inserts new Thread/User rows via get_forum_page, updates
    Forum.latest_thread, and commits the session.
    """
    tsession = sessionmaker(bind=db)
    session = tsession()
    forum_entry = session.query(Forum).filter(Forum.fid == fid).one()
    latest = forum_entry.latest_thread
    if latest is None:  # fixed: was `== None`
        # First crawl of this forum: start from the site's epoch.
        latest = datetime.fromtimestamp(ASTOST_ZERO, tz=ASTOST_ZONE)
    else:
        # The DB returns naive datetimes; re-attach the site timezone.
        latest = ASTOST_ZONE.localize(latest)
    # Newest non-pinned thread update time seen during this crawl.
    thread_latest = datetime.fromtimestamp(ASTOST_ZERO, tz=ASTOST_ZONE)
    report = tqdm.tqdm(desc="Threads", unit='day',
                       total=(datetime.now(ASTOST_ZONE) - latest).total_seconds() / 86400)
    thread_counter = 0
    for page in icount(1):
        thread_ltime, stop_flag, count = get_forum_page(session, fid, report, page, latest)
        thread_counter += count
        if thread_ltime > thread_latest:
            thread_latest = thread_ltime
        if stop_flag:
            logger.info("Forum %d @%d: reached latest entry", fid, page)
            break
        else:
            # Random pause between pages to stay polite to the server.
            ltime = uniform(0, max_interval)
            logger.info("Forum %d @%d: sleep for %.2f seconds", fid, page, ltime)
            sleep(ltime)
    report.close()
    logger.info("Forum %d: find %d new threads", fid, thread_counter)
    logger.info("Forum %d: finished forum, committing", fid)
    if thread_latest < latest:
        logger.warning("Latest thread (%s) is older than the one in database (%s),"
                       " could be caused by a thread being deleted or hidden",
                       thread_latest, latest)
    forum_entry.latest_thread = thread_latest
    session.commit()
def get_forum_page(session, fid, report, page=1, stop=None):
    """Fetch one listing page of forum `fid` and record its threads and authors.

    Parameters
    ----------
    session : SQLAlchemy session used to add Thread/User rows (not committed here).
    fid : forum id.
    report : tqdm progress bar, advanced as older threads are reached.
    page : 1-based page number of the forum listing.
    stop : tz-aware datetime; stop early once a non-pinned thread updated at or
        before this time is seen. Defaults to the site epoch. (The previous
        default of 0 raised TypeError when compared against a datetime.)

    Returns
    -------
    (thread_latest, stop_flag, thread_counter): the newest update time among
    non-pinned threads on this page, whether the caller should stop paging,
    and the number of newly inserted threads.
    """
    if stop is None:
        stop = datetime.fromtimestamp(ASTOST_ZERO, tz=ASTOST_ZONE)
    logger.info("Forum %d @%d: fetching pages...", fid, page)
    forum_addr = "https://www.astost.com/bbs/thread.php?fid={}&page={}".format(fid, page)
    response = srequest.get(forum_addr, cookies=cookies, headers=headers, verify=False, timeout=5)
    page_content = html.fromstring(response.content)
    # Total page count appears as ".../<count> " inside div.pages, if present.
    pages_block = page_content.xpath("//div[@class='pages']")
    if len(pages_block) > 0:
        pages_text = "".join(pages_block[0].itertext())
        pages_text_slash = pages_text.rfind('/')
        page_count = int(pages_text[pages_text_slash+1 : pages_text.find(' ', pages_text_slash)])
    else:
        page_count = 1  # no pager rendered: single-page forum
    thread_table = page_content.xpath("//table[@id='ajaxtable']/tbody/tr[@class='tr3 t_one']")
    thread_counter = 0
    thread_latest = datetime.fromtimestamp(ASTOST_ZERO, tz=ASTOST_ZONE)
    stop_flag = False  # This flag is for early stop
    for thread_tr in thread_table:
        # Walk the row's cells in document order.
        thread_iter = iter(thread_tr)
        assert next(thread_iter).tag == 'th'  # first cell is expected to be a <th>
        node_title = next(thread_iter)
        # The id attribute carries the thread id after a 3-character prefix.
        thread_id = int(node_title.attrib['id'][3:])
        thread_name = ''.join(node_title.xpath(".//a[contains(@id, 'ajax')]")[0].itertext())
        node_author = next(thread_iter)
        node_author_iter = iter(node_author)
        node_author_link = next(node_author_iter)
        node_author_addr = node_author_link.attrib['href']
        # The author uid is the value of the last query parameter in the href.
        author_id = int(node_author_addr[node_author_addr.rfind('=')+1:])
        author_name = ''.join(node_author_link.itertext())
        thread_date = ''.join(next(node_author_iter).itertext()).strip()
        thread_date = datetime.strptime(thread_date, '%Y-%m-%d')
        thread_date = ASTOST_ZONE.localize(thread_date)
        # Skip two cells that are not parsed here.
        next(thread_iter)
        next(thread_iter)
        node_reply = next(thread_iter)
        thread_update = ''.join(node_reply.xpath(".//a")[0].itertext()).strip()
        thread_update = datetime.strptime(thread_update, '%Y-%m-%d %H:%M')
        thread_update = ASTOST_ZONE.localize(thread_update)
        # Decide whether to stop; the img alt marks a pinned thread.
        thread_pinned = len(thread_tr.xpath(".//img[@alt='置顶帖标志']")) > 0
        if not thread_pinned:  # Compare threads only for non pinned posts
            if thread_update > thread_latest:
                thread_latest = thread_update
            if thread_update <= stop:
                # Everything below this row was already seen in a prior crawl.
                stop_flag = True
                break
            else:  # Update progress bar, FIXME: sometimes progress will go back?
                delta = (thread_update - stop).total_seconds() / 86400
                report.n = (report.total - delta)
                report.refresh()
        # Add thread and user entry; threads are usually sorted in webpage unless updated
        if not session.query(Thread).filter(Thread.tid == thread_id).count():
            session.add(Thread(tid=thread_id, title=thread_name, time_create=thread_date,
                               time_update=thread_update, author_uid=author_id, forum_fid=fid))
            thread_counter += 1
        elif not thread_pinned:
            # logger.warn() is deprecated; use warning() with lazy %-args.
            logger.warning("Duplicate thread id %s, could be caused by new post.", thread_id)
        if not session.query(User).filter(User.uid == author_id).count():
            session.add(User(uid=author_id, name=author_name))
    if page >= page_count:
        stop_flag = True  # last listing page reached
    return thread_latest, stop_flag, thread_counter
def main(max_interval=3):
    """Entry point: ensure the schema exists, then crawl every forum in turn.

    Parameters
    ----------
    max_interval : upper bound (seconds) of the random pause between forums
        and between pages inside each forum.
    """
    engine = sqlalchemy.create_engine('sqlite:///astost.db3')
    # Create tables
    Base.metadata.create_all(engine)
    # Fetch forum list
    forum_ids = get_forum_list(engine)
    # Update each forum, pausing a random interval between them.
    for forum_id in tqdm.tqdm(forum_ids, desc="Forums", unit="forum"):
        update_forum(engine, forum_id, max_interval)
        pause = uniform(0, max_interval)
        logger.info("Forum %d: finished, sleep for %.2f seconds..." % (forum_id, pause))
        sleep(pause)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sqlalchemy import Column, Integer, String, DateTime, ForeignKey, TIMESTAMP
from sqlalchemy.ext.declarative import declarative_base

# Shared declarative base for all ORM models in this module.
Base = declarative_base()
class Forum(Base):
    """A forum (board) on the site."""

    __tablename__ = 'forums'

    fid = Column(Integer, primary_key=True)  # forum id from the site URL
    name = Column(String)
    # Timestamp of the most recent thread recorded for this forum; None
    # until the forum has been crawled at least once.
    latest_thread = Column(TIMESTAMP(timezone=True))

    def __repr__(self):
        return f"<Forum {self.name}>"
class Thread(Base):
    """A forum thread with its author, forum, and creation/update times."""

    __tablename__ = "threads"

    tid = Column(Integer, primary_key=True)  # thread id from the site
    title = Column(String)
    # TODO: Change these two types to time
    time_create = Column(TIMESTAMP(timezone=True))
    time_update = Column(TIMESTAMP(timezone=True))
    author_uid = Column(Integer, ForeignKey('users.uid'))
    forum_fid = Column(Integer, ForeignKey('forums.fid'))

    def __repr__(self):
        return f"<Thread {self.title}>"
class User(Base):
    """A site user, recorded when seen as a thread author."""

    __tablename__ = "users"

    uid = Column(Integer, primary_key=True)  # user id from the site
    name = Column(String)

    def __repr__(self):
        return f"<User {self.name}>"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment