Skip to content

Instantly share code, notes, and snippets.

@cmpute
Last active July 1, 2020 21:36
Show Gist options
  • Save cmpute/4f6ddd0cd249edabd6aa3d5cbcfacfc9 to your computer and use it in GitHub Desktop.
Save cmpute/4f6ddd0cd249edabd6aa3d5cbcfacfc9 to your computer and use it in GitHub Desktop.
astost_crawler
'''
Before running the scripts, please set page loading order to descending by post date (instead of reply date)
'''
import browsercookie
import sqlalchemy
import tqdm
from random import uniform
from itertools import count as icount
from time import sleep
from sqlalchemy.orm import sessionmaker
from lxml import html
from models import Base, Forum, Thread, User
import pytz
from datetime import datetime, timezone, timedelta
ASTOST_ZERO = 996700380 # ASTOST's actual birthday
ASTOST_ZONE = pytz.timezone('Asia/Shanghai')
# Suppress InsecureRequestWarning and set up the request retry mechanism
import requests
from requests.packages import urllib3
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
urllib3.disable_warnings()
def requests_retry_session(
    retries=5,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
):
    """Return a requests session whose HTTP(S) adapters retry failed requests.

    Args:
        retries: Value used for the total, read and connect retry limits.
        backoff_factor: Back-off factor handed to ``Retry``.
        status_forcelist: HTTP status codes handed to ``Retry`` as
            ``status_forcelist``.
        session: Existing session to configure; a new ``requests.Session``
            is created when this is falsy.

    Returns:
        The configured session.
    """
    if not session:
        session = requests.Session()

    policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    retrying_adapter = HTTPAdapter(max_retries=policy)

    # The same adapter instance serves both URL schemes.
    for prefix in ('http://', 'https://'):
        session.mount(prefix, retrying_adapter)
    return session
# Shared HTTP session with retries; every page request in this file goes through it
srequest = requests_retry_session()

# Logging settings: the crawler logger and the SQLAlchemy logger write to the same file
import logging

handler = logging.FileHandler('astost.log')
formatter = logging.Formatter('[%(asctime)s][%(module)s] %(message)s')
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)

logger = logging.getLogger("crawler")
logger.handlers = [handler]
logger.setLevel(logging.INFO)

sqllogger = logging.getLogger('sqlalchemy')
sqllogger.setLevel(logging.WARNING)
sqllogger.handlers = [handler]

# Setup client data
# NOTE(review): presumably these are the cookies of the local Firefox profile,
# which carry the forum login - confirm against the browsercookie docs.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
}
cookies = browsercookie.firefox()
def get_forum_list(db):
    """Fetch the forum index page and register every forum linked on it.

    Forums whose id is not yet in the database are inserted with
    ``latest_thread=None``; existing rows are left untouched.

    Args:
        db: SQLAlchemy engine the ORM session is bound to.

    Returns:
        List of forum ids (ints), one per forum link collected from the page.

    Raises:
        RuntimeError: if the fetched page contains a ``<form>`` element,
            which this crawler treats as "not logged in".
    """
    logger.info("Site: fetching forum list")
    session = sessionmaker(bind=db)()
    try:
        response = srequest.get("https://www.astost.com/bbs/", cookies=cookies, headers=headers, verify=False, timeout=1)
        page = html.fromstring(response.content)
        if page.xpath("//form"):
            raise RuntimeError("You need to login first!")

        # Per forum row: first the <h3> links, then the links with class "style1".
        forum_blocks = []
        for forum in page.xpath("//tr[@class='tr3 f_one']"):
            forum_blocks.extend(forum.xpath('.//h3/a'))
            forum_blocks.extend(forum.xpath(".//a[@class='style1']"))

        forum_idlist = []
        for entry in forum_blocks:
            forum_name = ''.join(entry.itertext()).lstrip('【').rstrip('】')
            forum_link = entry.attrib['href']
            # The forum id is whatever follows the last '=' of the link.
            forum_id = int(forum_link[forum_link.rfind('=')+1:])
            forum_idlist.append(forum_id)
            if not session.query(Forum).filter(Forum.fid == forum_id).count():
                session.add(Forum(fid=forum_id, name=forum_name, latest_thread=None))

        logger.info("Site: total %d forums found", len(forum_idlist))
        logger.info("Site: committing forum list")
        session.commit()
        return forum_idlist
    finally:
        # Always release the session (and its connection), including on errors;
        # uncommitted changes are discarded by close().
        session.close()
def update_forum(db, fid, max_interval=3):
    """Crawl one forum page by page and store its new threads.

    Pages are fetched starting at page 1 until ``get_forum_page`` reports
    that crawling should stop. Between pages the function sleeps for a
    random time. When the crawl finishes, the newest thread update time that
    was seen is written to the forum row and everything is committed in one
    transaction; if an error occurs nothing is committed.

    Args:
        db: SQLAlchemy engine the ORM session is bound to.
        fid: Numeric id of the forum to update; its row must already exist.
        max_interval: Upper bound, in seconds, of the random sleep between
            two page requests.
    """
    session = sessionmaker(bind=db)()
    try:
        forum_entry = session.query(Forum).filter(Forum.fid == fid).one()
        site_birth = datetime.fromtimestamp(ASTOST_ZERO, tz=ASTOST_ZONE)

        latest = forum_entry.latest_thread
        if latest is None:
            # Never crawled before: go back to the beginning of the site.
            latest = site_birth
        elif latest.tzinfo is None:
            # Naive value read back from the database: attach the site zone.
            latest = ASTOST_ZONE.localize(latest)
        else:
            # localize() rejects aware datetimes, so convert instead.
            latest = latest.astimezone(ASTOST_ZONE)

        thread_latest = site_birth  # Save the latest thread update time seen
        thread_counter = 0
        report = tqdm.tqdm(desc="Threads", unit='day', total=(datetime.now(ASTOST_ZONE)-latest).total_seconds()/86400)
        try:
            for page in icount(1):
                thread_ltime, stop_flag, count = get_forum_page(session, fid, report, page, latest)
                thread_counter += count
                thread_latest = max(thread_latest, thread_ltime)
                if stop_flag:
                    logger.info("Forum %d @%d: reached lastest entry", fid, page)
                    break
                ltime = uniform(0, max_interval)
                logger.info("Forum %d @%d: sleep for %.2f seconds", fid, page, ltime)
                sleep(ltime)
        finally:
            # Close the progress bar even when a page fails to load or parse.
            report.close()

        logger.info("Forum %d: find %d new threads", fid, thread_counter)
        logger.info("Forum %d: finished forum, committing", fid)
        if thread_latest < latest:
            logger.warning("Latest thread ({}) is older than the one in database ({}),"
                           " could be cause by thread deleted or hided".format(thread_latest, latest))
        # The stored marker is overwritten even when it moves backwards.
        forum_entry.latest_thread = thread_latest
        session.commit()
    finally:
        # Release the session in every case; uncommitted work is discarded.
        session.close()
def get_forum_page(session, fid, report, page=1, stop=0):
    """Fetch one listing page of a forum and stage its new threads and users.

    Args:
        session: ORM session used for the existence checks and ``add`` calls;
            nothing is committed here.
        fid: Numeric forum id.
        report: tqdm progress bar (created with a ``total``) whose position
            is advanced from the thread update times.
        page: Page number to fetch.
        stop: Time-zone aware datetime; scanning ends at the first non-pinned
            thread whose last update is at or before it. A plain number is
            interpreted as a POSIX timestamp.

    Returns:
        Tuple ``(thread_latest, stop_flag, thread_counter)``: the newest
        update time among the non-pinned threads examined, whether crawling
        of this forum should stop (early stop or last page reached), and the
        number of threads newly added to the session.

    Raises:
        RuntimeError: if a thread row does not start with a ``th`` cell.
    """
    logger.info("Forum %d @%d: fetching pages...", fid, page)
    if not isinstance(stop, datetime):
        # A bare number (such as the default 0) cannot be compared with the
        # aware datetimes parsed below, so convert it first.
        stop = datetime.fromtimestamp(stop, tz=ASTOST_ZONE)

    forum_addr = "https://www.astost.com/bbs/thread.php?fid={}&page={}".format(fid, page)
    response = srequest.get(forum_addr, cookies=cookies, headers=headers, verify=False, timeout=5)
    page_content = html.fromstring(response.content)

    # The pager text holds "<current>/<total>"; the total is read between the
    # last '/' and the following space. Without a pager there is one page.
    pages_block = page_content.xpath("//div[@class='pages']")
    if pages_block:
        pages_text = "".join(pages_block[0].itertext())
        pages_text_slash = pages_text.rfind('/')
        page_count = int(pages_text[pages_text_slash+1 : pages_text.find(' ', pages_text_slash)])
    else:
        page_count = 1

    thread_table = page_content.xpath("//table[@id='ajaxtable']/tbody/tr[@class='tr3 t_one']")
    thread_counter = 0
    thread_latest = datetime.fromtimestamp(ASTOST_ZERO, tz=ASTOST_ZONE)
    stop_flag = False  # This flag is for early stop
    for thread_tr in thread_table:
        thread_iter = iter(thread_tr)

        # Consume the leading cell with an explicit check: an ``assert`` would
        # be stripped under ``python -O`` together with the next() call,
        # shifting every following cell by one.
        if next(thread_iter).tag != 'th':
            raise RuntimeError("Forum %d @%d: unexpected thread row layout" % (fid, page))

        # Title cell: the element id carries the thread id after a 3-char prefix.
        node_title = next(thread_iter)
        thread_id = int(node_title.attrib['id'][3:])
        thread_name = ''.join(node_title.xpath(".//a[contains(@id, 'ajax')]")[0].itertext())

        # Author cell: profile link (user id after the last '='), then post date.
        node_author = next(thread_iter)
        node_author_iter = iter(node_author)
        node_author_link = next(node_author_iter)
        node_author_addr = node_author_link.attrib['href']
        author_id = int(node_author_addr[node_author_addr.rfind('=')+1:])
        author_name = ''.join(node_author_link.itertext())
        thread_date = ''.join(next(node_author_iter).itertext()).strip()
        thread_date = ASTOST_ZONE.localize(datetime.strptime(thread_date, '%Y-%m-%d'))

        # Skip two cells that are not used, then read the last-reply cell.
        next(thread_iter)
        next(thread_iter)
        node_reply = next(thread_iter)
        thread_update = ''.join(node_reply.xpath(".//a")[0].itertext()).strip()
        thread_update = ASTOST_ZONE.localize(datetime.strptime(thread_update, '%Y-%m-%d %H:%M'))

        # Decide whether to stop
        thread_pinned = len(thread_tr.xpath(".//img[@alt='置顶帖标志']")) > 0
        if not thread_pinned:  # Compare threads only for non pinned posts
            thread_latest = max(thread_latest, thread_update)
            if thread_update <= stop:
                stop_flag = True
                break
            # Update progress bar. A row with a newer update time than an
            # earlier row would move the bar backwards, so only ever advance.
            delta = (thread_update - stop).total_seconds() / 86400
            report.n = max(report.n, report.total - delta)
            report.refresh()

        # Add thread and user entry, threads are usually sorted in webpage unless updated
        if not session.query(Thread).filter(Thread.tid == thread_id).count():
            session.add(Thread(tid=thread_id, title=thread_name, time_create=thread_date, time_update=thread_update, author_uid=author_id, forum_fid=fid))
            thread_counter += 1
        elif not thread_pinned:
            # Logger.warn() is deprecated and removed in Python 3.13.
            logger.warning("Duplicate thread id {}, could be caused by new post.".format(thread_id))
        if not session.query(User).filter(User.uid == author_id).count():
            session.add(User(uid=author_id, name=author_name))

    # The last page also ends the crawl of this forum.
    if page >= page_count:
        stop_flag = True
    return thread_latest, stop_flag, thread_counter
def main(max_interval=3):
    """Create the database tables, then crawl every forum of the site.

    Args:
        max_interval: Upper bound, in seconds, of the random pauses; it is
            passed on to ``update_forum`` and also used between forums.
    """
    db = sqlalchemy.create_engine('sqlite:///astost.db3')
    Base.metadata.create_all(db)  # Create tables

    forum_ids = get_forum_list(db)  # Fetch forum list
    progress = tqdm.tqdm(forum_ids, desc="Forums", unit="forum")
    for forum_id in progress:  # Update each forum
        update_forum(db, forum_id, max_interval)
        pause = uniform(0, max_interval)
        logger.info("Forum %d: finished, sleep for %.2f seconds...", forum_id, pause)
        sleep(pause)


if __name__ == "__main__":
    main()
# ORM table definitions; the crawler above imports these names from the `models` module
from sqlalchemy import Column, Integer, String, DateTime, ForeignKey, TIMESTAMP
from sqlalchemy.ext.declarative import declarative_base
# Declarative base class that the mapped classes below inherit from
Base = declarative_base()
class Forum(Base):
    """ORM mapping of the ``forums`` table: one row per forum."""

    __tablename__ = "forums"

    # Numeric forum id (primary key)
    fid = Column(Integer, primary_key=True)
    # Forum name
    name = Column(String)
    # Crawl marker; nullable timestamp
    latest_thread = Column(TIMESTAMP(timezone=True))

    def __repr__(self):
        label = self.name
        return "<Forum %s>" % label
class Thread(Base):
    """ORM mapping of the ``threads`` table: one row per forum thread."""

    __tablename__ = 'threads'

    # Numeric thread id (primary key)
    tid = Column(Integer, primary_key=True)
    # Thread title
    title = Column(String)
    # Creation and last-update timestamps
    time_create = Column(TIMESTAMP(timezone=True))  # TODO: Change these two types to time
    time_update = Column(TIMESTAMP(timezone=True))
    # References to the author (users.uid) and the containing forum (forums.fid)
    author_uid = Column(Integer, ForeignKey('users.uid'))
    forum_fid = Column(Integer, ForeignKey('forums.fid'))

    def __repr__(self):
        label = self.title
        return "<Thread %s>" % label
class User(Base):
    """ORM mapping of the ``users`` table: one row per user."""

    __tablename__ = 'users'

    # Numeric user id (primary key)
    uid = Column(Integer, primary_key=True)
    # User name
    name = Column(String)

    def __repr__(self):
        label = self.name
        return "<User %s>" % label
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment