JustAnotherArchivist/hook.py Secret

## hook.py
import html5lib
import logging
import re
import urllib.parse


# Logger used for all warnings emitted by this hook.
_logger = logging.getLogger('wpull.plugin.roblox-forums-indices')

# Output file for the '<thread ID> <max page index>' lines; opened (and truncated) as soon as the script is loaded.
fpThreadList = open('threads.txt', 'w')

# Absolute URL prefixes; the numeric forum/post ID follows directly after them.
forumUrlPrefix = 'https://forum.roblox.com/Forum/ShowForum.aspx?ForumID='
postUrlPrefix = 'https://forum.roblox.com/Forum/ShowPost.aspx?PostID='
# Matches what follows postUrlPrefix when the URL carries an explicit PageIndex parameter.
postUrlPagePattern = re.compile(r'^\d+&PageIndex=\d+$')

# Markup fragments searched for in the forum listing HTML; the page ID / post ID follows directly after them.
pagerIdPrefix = 'ctl00_cphRoblox_ThreadView1_ctl00_Pager_Page'
pagerLinkSearchString = '<a id="ctl00_cphRoblox_ThreadView1_ctl00_Pager_Page'
threadHrefSearchString = '<a class="post-list-subject" href="/Forum/ShowPost.aspx?PostID='

# Precomputed lengths of the strings above, used for slicing behind a prefix.
forumUrlPrefixLength = len(forumUrlPrefix)
postUrlPrefixLength = len(postUrlPrefix)
pagerIdPrefixLength = len(pagerIdPrefix)
pagerLinkSearchStringLength = len(pagerLinkSearchString)
threadHrefSearchStringLength = len(threadHrefSearchString)


# Page ID is zero-based and used e.g. in the pager links' id attribute. Page index is one-based and used e.g. in the URLs as the PageIndex parameter.


# https://stackoverflow.com/a/4665027
def find_all(a_str, sub):
	"""Yield the start offset of every non-overlapping occurrence of sub in a_str.

	Offsets are yielded in ascending order. After a hit, the search resumes directly behind that hit, so find_all('aaaa', 'aa') yields 0 and 2 (not 1).

	Raises ValueError on the first iteration if sub is empty: an empty needle is found at the current position without ever advancing the search, which would otherwise yield 0 forever.
	"""
	if not sub:
		raise ValueError('find_all requires a non-empty search string')
	subLength = len(sub)
	start = a_str.find(sub)
	while start != -1:
		yield start
		# Continue behind the hit that was just reported
		start = a_str.find(sub, start + subLength)


def get_urls(filename, urlInfo, documentInfo):
	"""Queue the later pages of a forum listing and record the threads listed on the fetched page.

	Only URLs starting with forumUrlPrefix are processed; for any other URL an empty list is returned without reading the response body.

	When nothing but digits follows forumUrlPrefix (the listing URL without a PageIndex parameter), one entry per later listing page is returned, derived from the pager links found in the body. For every processed page, each thread row is written to fpThreadList as '<thread ID> <highest page index linked in that row>'; the page index is 0 when the row links no further pages.

	Parameters:
		filename: not used; the body is read from documentInfo['filename'].
		urlInfo: mapping whose 'url' entry is the URL that was fetched.
		documentInfo: mapping whose 'filename' entry is the path of the stored response body.

	Returns:
		A list of {'url': ...} dicts, one per URL to add.
	"""

	def untilQuote(text, start):
		# Text from start up to (excluding) the next double quote. If there is none, find() returns -1 and everything but the last character is returned; callers reject such values through their isdigit()/hit-count checks.
		return text[start:text.find('"', start)]

	result = []
	url = urlInfo['url']

	# Anything but a forum listing page needs neither the body nor any processing
	if not url.startswith(forumUrlPrefix):
		return result

	with open(documentInfo['filename'], 'rb') as fp:
		body = fp.read().decode('utf-8', 'surrogateescape')

	# Add later pages of the forum listing
	if url[forumUrlPrefixLength:].isdigit():
		pagerOffsets = list(find_all(body, '<span id="ctl00_cphRoblox_ThreadView1_ctl00_Pager"><table'))
		if len(pagerOffsets) != 1:
			_logger.warning('Roblox hook ‘{}’: searching for the pager did not yield exactly one hit. Ignoring.'.format(url))
		else:
			pagerEndOffset = body.find('</table></span>', pagerOffsets[0])
			if pagerEndOffset == -1:
				# Unterminated pager: use the rest of the body rather than slicing with -1, which would drop the last character
				pagerEndOffset = len(body)
			pager = body[pagerOffsets[0]:pagerEndOffset]
			maxPageId = 0
			for linkOffset in find_all(pager, pagerLinkSearchString):
				page = untilQuote(pager, linkOffset + pagerLinkSearchStringLength)
				if page.isdigit():
					maxPageId = max(maxPageId, int(page))
			# NOTE(review): maxPageId comes from the pager links' id attribute, which the comment near the top of this file describes as zero-based, while PageIndex is one-based. If that holds, the last page (PageIndex maxPageId + 1) is not queued here. Left unchanged; confirm against real pager markup.
			result.extend({'url': url + '&PageIndex=' + str(pageIndex)} for pageIndex in range(2, maxPageId + 1))

	# Find threads and their page count; write them to fpThreadList
	for threadRowOffset in find_all(body, '<tr class="forum-table-row">'):
		threadRowEndOffset = body.find('</tr>', threadRowOffset)
		if threadRowEndOffset == -1:
			# Unterminated row: it extends to the end of the body
			threadRowEndOffset = len(body)
		threadRow = body[threadRowOffset:threadRowEndOffset]
		threadHrefOffsets = list(find_all(threadRow, threadHrefSearchString))
		if len(threadHrefOffsets) != 1:
			_logger.warning('Roblox hook ‘{}’ offset {}: searching for the thread link did not yield exactly one hit. Ignoring.'.format(url, threadRowOffset))
			continue
		threadId = untilQuote(threadRow, threadHrefOffsets[0] + threadHrefSearchStringLength)
		# The body is raw HTML, so the ampersand in the href appears as the entity &amp;
		threadPageLinkSearchString = '<a class="linkSmall" href="/Forum/ShowPost.aspx?PostID=' + threadId + '&amp;PageIndex='
		threadPageLinkSearchStringLength = len(threadPageLinkSearchString)
		maxPageIndex = 0
		for threadPageLinkOffset in find_all(threadRow, threadPageLinkSearchString):
			pageIndex = untilQuote(threadRow, threadPageLinkOffset + threadPageLinkSearchStringLength)
			if pageIndex.isdigit():
				maxPageIndex = max(maxPageIndex, int(pageIndex))
			else:
				_logger.warning('Roblox hook ‘{}’ offsets {}/{}: page index is not numeric. Ignoring.'.format(url, threadRowOffset, threadPageLinkOffset))
		fpThreadList.write(threadId + ' ' + str(maxPageIndex) + '\n')

	return result


# Install get_urls as the get_urls callback on wpull_hook. wpull_hook is not imported or defined in this file; presumably wpull provides it when it loads the script via --python-script (confirm against the wpull scripting docs).
wpull_hook.callbacks.get_urls = get_urls

## plugin.py
import wpull.database.sqltable


# From ludios's grab-site, libgrabsite/plugin.py, commit 7a63a3dc
class NoFsyncSQLTable(wpull.database.sqltable.SQLiteURLTable):
	"""SQLiteURLTable whose pragma callback additionally executes PRAGMA synchronous=OFF."""

	@classmethod
	def _apply_pragmas_callback(cls, connection, record):
		"""Run the parent class's pragma callback, then execute PRAGMA synchronous=OFF on connection.

		connection and record are passed through to the parent callback unchanged; record is not used otherwise.
		"""
		# Parent pragmas go first so that the override below is executed last
		super(NoFsyncSQLTable, cls)._apply_pragmas_callback(connection, record)
		print('Applying PRAGMA')
		noSyncStatement = 'PRAGMA synchronous=OFF'
		connection.execute(noSyncStatement)


# Store NoFsyncSQLTable under the 'URLTableImplementation' key of the plugin factory's class map, presumably so that wpull instantiates it instead of its default URL table (confirm). wpull_plugin is not imported or defined in this file; presumably wpull provides it when loading the script via --plugin-script.
wpull_plugin.factory.class_map['URLTableImplementation'] = NoFsyncSQLTable

## run.sh
#!/bin/bash
# Runs wpull against the Roblox forum index pages with hook.py and plugin.py loaded.
# The arguments are collected in an array so that they can be grouped and commented;
# wpull receives exactly the same argument list as with a single continued command line.
wpullArgs=(
	# Start URL
	'https://forum.roblox.com/Forum/'

	# WARC output (2147483648 bytes = 2 GiB)
	--warc-append --warc-file roblox-forums-indices --warc-max-size 2147483648

	# Recursion scope
	--recursive --level inf
	--no-parent
	--no-robots

	# Connection and request behaviour
	--no-check-certificate
	--user-agent 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
	--concurrent 4
	--waitretry 600
	--escaped-fragment
	--strip-session-id
	--tries 3
	--retry-connrefused
	--retry-dns-error
	--timeout 900
	--session-timeout 21600

	# Local files: downloaded documents, URL database, log
	--delete-after
	--database roblox-forums-indices.db
	--no-verbose
	--output-file roblox-forums-indices.log

	# Scripts loaded into wpull
	--python-script hook.py
	--plugin-script plugin.py

	# Accepted URLs: the forum root, forum group pages, and forum listing pages with an optional PageIndex
	--accept-regex '^https://forum\.roblox\.com/Forum/(ShowForumGroup\.aspx\?ForumGroupID=\d+|ShowForum\.aspx\?ForumID=\d+(&PageIndex=\d+)?)?$'
)

TZ=UTC ~/.local/bin/wpull "${wpullArgs[@]}"
	import html5lib
	import logging
	import re
	import urllib.parse


	# Logger used for all warnings emitted by this hook.
	_logger = logging.getLogger('wpull.plugin.roblox-forums-indices')

	# Output file for the '<thread ID> <max page index>' lines; opened (and truncated) as soon as the script is loaded.
	fpThreadList = open('threads.txt', 'w')

	# Absolute URL prefixes; the numeric forum/post ID follows directly after them.
	forumUrlPrefix = 'https://forum.roblox.com/Forum/ShowForum.aspx?ForumID='
	postUrlPrefix = 'https://forum.roblox.com/Forum/ShowPost.aspx?PostID='
	# Matches what follows postUrlPrefix when the URL carries an explicit PageIndex parameter.
	postUrlPagePattern = re.compile(r'^\d+&PageIndex=\d+$')

	# Markup fragments searched for in the forum listing HTML; the page ID / post ID follows directly after them.
	pagerIdPrefix = 'ctl00_cphRoblox_ThreadView1_ctl00_Pager_Page'
	pagerLinkSearchString = '<a id="ctl00_cphRoblox_ThreadView1_ctl00_Pager_Page'
	threadHrefSearchString = '<a class="post-list-subject" href="/Forum/ShowPost.aspx?PostID='

	# Precomputed lengths of the strings above, used for slicing behind a prefix.
	forumUrlPrefixLength = len(forumUrlPrefix)
	postUrlPrefixLength = len(postUrlPrefix)
	pagerIdPrefixLength = len(pagerIdPrefix)
	pagerLinkSearchStringLength = len(pagerLinkSearchString)
	threadHrefSearchStringLength = len(threadHrefSearchString)


	# Page ID is zero-based and used e.g. in the pager links' id attribute. Page index is one-based and used e.g. in the URLs as the PageIndex parameter.


	# https://stackoverflow.com/a/4665027
	def find_all(a_str, sub):
		"""Yield the start offset of every non-overlapping occurrence of sub in a_str.

		Offsets are yielded in ascending order. After a hit, the search resumes directly behind that hit, so find_all('aaaa', 'aa') yields 0 and 2 (not 1).

		Raises ValueError on the first iteration if sub is empty: an empty needle is found at the current position without ever advancing the search, which would otherwise yield 0 forever.
		"""
		if not sub:
			raise ValueError('find_all requires a non-empty search string')
		subLength = len(sub)
		start = a_str.find(sub)
		while start != -1:
			yield start
			# Continue behind the hit that was just reported
			start = a_str.find(sub, start + subLength)


	def get_urls(filename, urlInfo, documentInfo):
		"""Queue the later pages of a forum listing and record the threads listed on the fetched page.

		Only URLs starting with forumUrlPrefix are processed; for any other URL an empty list is returned without reading the response body.

		When nothing but digits follows forumUrlPrefix (the listing URL without a PageIndex parameter), one entry per later listing page is returned, derived from the pager links found in the body. For every processed page, each thread row is written to fpThreadList as '<thread ID> <highest page index linked in that row>'; the page index is 0 when the row links no further pages.

		Parameters:
			filename: not used; the body is read from documentInfo['filename'].
			urlInfo: mapping whose 'url' entry is the URL that was fetched.
			documentInfo: mapping whose 'filename' entry is the path of the stored response body.

		Returns:
			A list of {'url': ...} dicts, one per URL to add.
		"""

		def untilQuote(text, start):
			# Text from start up to (excluding) the next double quote. If there is none, find() returns -1 and everything but the last character is returned; callers reject such values through their isdigit()/hit-count checks.
			return text[start:text.find('"', start)]

		result = []
		url = urlInfo['url']

		# Anything but a forum listing page needs neither the body nor any processing
		if not url.startswith(forumUrlPrefix):
			return result

		with open(documentInfo['filename'], 'rb') as fp:
			body = fp.read().decode('utf-8', 'surrogateescape')

		# Add later pages of the forum listing
		if url[forumUrlPrefixLength:].isdigit():
			pagerOffsets = list(find_all(body, '<span id="ctl00_cphRoblox_ThreadView1_ctl00_Pager"><table'))
			if len(pagerOffsets) != 1:
				_logger.warning('Roblox hook ‘{}’: searching for the pager did not yield exactly one hit. Ignoring.'.format(url))
			else:
				pagerEndOffset = body.find('</table></span>', pagerOffsets[0])
				if pagerEndOffset == -1:
					# Unterminated pager: use the rest of the body rather than slicing with -1, which would drop the last character
					pagerEndOffset = len(body)
				pager = body[pagerOffsets[0]:pagerEndOffset]
				maxPageId = 0
				for linkOffset in find_all(pager, pagerLinkSearchString):
					page = untilQuote(pager, linkOffset + pagerLinkSearchStringLength)
					if page.isdigit():
						maxPageId = max(maxPageId, int(page))
				# NOTE(review): maxPageId comes from the pager links' id attribute, which the comment above find_all describes as zero-based, while PageIndex is one-based. If that holds, the last page (PageIndex maxPageId + 1) is not queued here. Left unchanged; confirm against real pager markup.
				result.extend({'url': url + '&PageIndex=' + str(pageIndex)} for pageIndex in range(2, maxPageId + 1))

		# Find threads and their page count; write them to fpThreadList
		for threadRowOffset in find_all(body, '<tr class="forum-table-row">'):
			threadRowEndOffset = body.find('</tr>', threadRowOffset)
			if threadRowEndOffset == -1:
				# Unterminated row: it extends to the end of the body
				threadRowEndOffset = len(body)
			threadRow = body[threadRowOffset:threadRowEndOffset]
			threadHrefOffsets = list(find_all(threadRow, threadHrefSearchString))
			if len(threadHrefOffsets) != 1:
				_logger.warning('Roblox hook ‘{}’ offset {}: searching for the thread link did not yield exactly one hit. Ignoring.'.format(url, threadRowOffset))
				continue
			threadId = untilQuote(threadRow, threadHrefOffsets[0] + threadHrefSearchStringLength)
			# The body is raw HTML, so the ampersand in the href appears as the entity &amp; and must be searched for in that form
			threadPageLinkSearchString = '<a class="linkSmall" href="/Forum/ShowPost.aspx?PostID=' + threadId + '&amp;PageIndex='
			threadPageLinkSearchStringLength = len(threadPageLinkSearchString)
			maxPageIndex = 0
			for threadPageLinkOffset in find_all(threadRow, threadPageLinkSearchString):
				pageIndex = untilQuote(threadRow, threadPageLinkOffset + threadPageLinkSearchStringLength)
				if pageIndex.isdigit():
					maxPageIndex = max(maxPageIndex, int(pageIndex))
				else:
					_logger.warning('Roblox hook ‘{}’ offsets {}/{}: page index is not numeric. Ignoring.'.format(url, threadRowOffset, threadPageLinkOffset))
			fpThreadList.write(threadId + ' ' + str(maxPageIndex) + '\n')

		return result


	# Install get_urls as the get_urls callback on wpull_hook. wpull_hook is not imported or defined in this file; presumably wpull provides it when it loads the script via --python-script (confirm against the wpull scripting docs).
	wpull_hook.callbacks.get_urls = get_urls
	import wpull.database.sqltable


	# From ludios's grab-site, libgrabsite/plugin.py, commit 7a63a3dc
	class NoFsyncSQLTable(wpull.database.sqltable.SQLiteURLTable):
		"""SQLiteURLTable whose pragma callback additionally executes PRAGMA synchronous=OFF."""

		@classmethod
		def _apply_pragmas_callback(cls, connection, record):
			"""Run the parent class's pragma callback, then execute PRAGMA synchronous=OFF on connection.

			connection and record are passed through to the parent callback unchanged; record is not used otherwise.
			"""
			# Parent pragmas go first so that the override below is executed last
			super()._apply_pragmas_callback(connection, record)
			print('Applying PRAGMA')
			connection.execute('PRAGMA synchronous=OFF')


	# Store NoFsyncSQLTable under the 'URLTableImplementation' key of the plugin factory's class map, presumably so that wpull instantiates it instead of its default URL table (confirm). wpull_plugin is not imported or defined in this file; presumably wpull provides it when loading the script via --plugin-script.
	wpull_plugin.factory.class_map['URLTableImplementation'] = NoFsyncSQLTable
	#!/bin/bash
	# Runs wpull against the Roblox forum index pages with hook.py and plugin.py loaded.
	# The --accept-regex alternation must use an unescaped '|': written as '\|' it matches a literal
	# pipe character, so neither the ShowForumGroup nor the ShowForum branch could match on its own.
	TZ=UTC ~/.local/bin/wpull \
		'https://forum.roblox.com/Forum/' \
		--warc-append --warc-file roblox-forums-indices --warc-max-size 2147483648 \
		--recursive --level inf \
		--no-parent \
		--no-robots \
		--no-check-certificate \
		--user-agent 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0' \
		--concurrent 4 \
		--waitretry 600 \
		--escaped-fragment \
		--strip-session-id \
		--tries 3 \
		--retry-connrefused \
		--retry-dns-error \
		--timeout 900 \
		--session-timeout 21600 \
		--delete-after \
		--database roblox-forums-indices.db \
		--no-verbose \
		--output-file roblox-forums-indices.log \
		--python-script hook.py \
		--plugin-script plugin.py \
		--accept-regex '^https://forum\.roblox\.com/Forum/(ShowForumGroup\.aspx\?ForumGroupID=\d+|ShowForum\.aspx\?ForumID=\d+(&PageIndex=\d+)?)?$'