Skip to content

Instantly share code, notes, and snippets.

@JustAnotherArchivist
Created July 25, 2017 17:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JustAnotherArchivist/85ef5c0e9d874791ee485fa69d08ac62 to your computer and use it in GitHub Desktop.
Save JustAnotherArchivist/85ef5c0e9d874791ee485fa69d08ac62 to your computer and use it in GitHub Desktop.
Roblox forum index grab
import html5lib
import logging
import re
import urllib.parse
# URL prefix of forum listing pages; the forum ID (optionally followed by
# '&PageIndex=N') comes right after it.
forumUrlPrefix = 'https://forum.roblox.com/Forum/ShowForum.aspx?ForumID='
forumUrlPrefixLength = len(forumUrlPrefix)

# URL prefix of thread pages and a pattern for the '<id>&PageIndex=<n>' tail
# that can follow it. (Neither is referenced by the code visible in this file.)
postUrlPrefix = 'https://forum.roblox.com/Forum/ShowPost.aspx?PostID='
postUrlPrefixLength = len(postUrlPrefix)
postUrlPagePattern = re.compile(r'^\d+&PageIndex=\d+$')

# The id attribute of a pager link is this prefix followed by the page ID.
pagerIdPrefix = 'ctl00_cphRoblox_ThreadView1_ctl00_Pager_Page'
pagerIdPrefixLength = len(pagerIdPrefix)

# Literal HTML that precedes the page ID inside a pager link.
pagerLinkSearchString = '<a id="ctl00_cphRoblox_ThreadView1_ctl00_Pager_Page'
pagerLinkSearchStringLength = len(pagerLinkSearchString)

# Literal HTML that precedes the thread ID inside a thread row's subject link.
threadHrefSearchString = '<a class="post-list-subject" href="/Forum/ShowPost.aspx?PostID='
threadHrefSearchStringLength = len(threadHrefSearchString)

_logger = logging.getLogger('wpull.plugin.roblox-forums-indices')

# Thread list output, one '<thread ID> <highest page index>' line per thread.
# The handle stays open for the lifetime of the process and is never closed
# explicitly, so it is opened line-buffered (buffering=1): every record is
# flushed as soon as its newline is written and is not lost from Python's
# buffer if the crawl is killed.
# NOTE(review): mode 'w' truncates threads.txt every time this script is
# loaded, so restarting the crawl discards the earlier records — confirm
# whether append mode is wanted for resumed runs.
fpThreadList = open('threads.txt', 'w', buffering=1)
# Page ID is zero-based and used e.g. in the pager links' id attribute. Page index is one-based and used e.g. in the URLs as the PageIndex parameter.
# https://stackoverflow.com/a/4665027
def find_all(a_str, sub):
    """Yield the start offset of each non-overlapping occurrence of *sub* in *a_str*.

    Offsets are produced in ascending order; after a hit the search resumes
    right behind the matched text, so overlapping occurrences are skipped
    (``find_all('aaaa', 'aa')`` yields 0 and 2).

    An empty *sub* matches at every offset from 0 to ``len(a_str)`` inclusive,
    in line with ``str.count('')``.
    """
    # Always advance by at least one character: with an empty needle
    # len(sub) is 0 and the search would otherwise yield offset 0 forever.
    step = max(len(sub), 1)
    start = a_str.find(sub)
    while start != -1:
        yield start
        start = a_str.find(sub, start + step)
def _text_until(text, start, terminator):
    """Return *text* from offset *start* up to the next *terminator*.

    When *terminator* does not occur at or after *start*, the remainder of
    *text* is returned. (Using the -1 from ``str.find`` directly as a slice end
    would silently drop the final character instead.)
    """
    end = text.find(terminator, start)
    if end == -1:
        return text[start:]
    return text[start:end]


def _later_listing_pages(url, body):
    """Return URL dicts for the follow-up pages of the forum listing at *url*.

    The largest numeric page ID among the pager links in *body* is determined
    and one ``{'url': url + '&PageIndex=<n>'}`` dict is returned for every n
    from 2 up to and including that ID. An empty list is returned (and a
    warning logged) unless the pager markup occurs exactly once.
    """
    pagerOffsets = list(find_all(body, '<span id="ctl00_cphRoblox_ThreadView1_ctl00_Pager"><table'))
    if len(pagerOffsets) != 1:
        _logger.warning('Roblox hook ‘{}’: searching for the pager did not yield exactly one hit. Ignoring.'.format(url))
        return []

    pager = _text_until(body, pagerOffsets[0], '</table></span>')
    maxPageId = 0
    for linkOffset in find_all(pager, pagerLinkSearchString):
        # The page ID runs from the end of the search string to the closing
        # quote of the id attribute.
        page = _text_until(pager, linkOffset + pagerLinkSearchStringLength, '"')
        if page.isdigit():
            maxPageId = max(maxPageId, int(page))

    # NOTE(review): if the IDs in the pager links are zero-based while
    # PageIndex is one-based, the last listing page would be PageIndex
    # maxPageId + 1, which this range does not include — confirm against
    # real pager markup before changing.
    return [{'url': url + '&PageIndex=' + str(pageIndex)} for pageIndex in range(2, maxPageId + 1)]


def _record_threads(url, body):
    """Write one '<thread ID> <highest page index>' line to fpThreadList per thread row in *body*.

    The highest page index is taken from the row's per-thread page links; it
    is 0 for a row without any such link. Rows that do not contain exactly one
    thread subject link are skipped with a warning. *url* is only used in log
    messages.
    """
    for threadRowOffset in find_all(body, '<tr class="forum-table-row">'):
        threadRow = _text_until(body, threadRowOffset, '</tr>')
        threadHrefOffsets = list(find_all(threadRow, threadHrefSearchString))
        if len(threadHrefOffsets) != 1:
            _logger.warning('Roblox hook ‘{}’ offset {}: searching for the thread link did not yield exactly one hit. Ignoring.'.format(url, threadRowOffset))
            continue

        threadId = _text_until(threadRow, threadHrefOffsets[0] + threadHrefSearchStringLength, '"')

        # '&amp;' because the search runs over the raw, still HTML-escaped markup.
        threadPageLinkSearchString = '<a class="linkSmall" href="/Forum/ShowPost.aspx?PostID=' + threadId + '&amp;PageIndex='
        threadPageLinkSearchStringLength = len(threadPageLinkSearchString)
        maxPageIndex = 0
        for threadPageLinkOffset in find_all(threadRow, threadPageLinkSearchString):
            pageIndex = _text_until(threadRow, threadPageLinkOffset + threadPageLinkSearchStringLength, '"')
            if not pageIndex.isdigit():
                _logger.warning('Roblox hook ‘{}’ offsets {}/{}: page index is not numeric. Ignoring.'.format(url, threadRowOffset, threadPageLinkOffset))
            else:
                maxPageIndex = max(maxPageIndex, int(pageIndex))
        fpThreadList.write(str(threadId) + ' ' + str(maxPageIndex) + '\n')


def get_urls(filename, urlInfo, documentInfo):
    """Hook callback: queue further forum listing pages and record the threads on this one.

    Only URLs starting with forumUrlPrefix are processed; for anything else an
    empty list is returned without reading the document.

    filename     -- unused; kept because it is part of the callback signature.
    urlInfo      -- mapping whose 'url' entry is the URL that was fetched.
    documentInfo -- mapping whose 'filename' entry is the path of the saved
                    response body.

    Returns a list of ``{'url': ...}`` dicts to be added to the crawl. Later
    listing pages are only generated from a forum's first page, i.e. when the
    URL ends in a bare numeric forum ID without a PageIndex parameter. As a
    side effect, one line per thread found on the page is written to
    fpThreadList.
    """
    url = urlInfo['url']
    if not url.startswith(forumUrlPrefix):
        return []

    # Load the response body; undecodable bytes are kept as surrogate escapes
    # rather than raising.
    with open(documentInfo['filename'], 'rb') as fp:
        body = fp.read().decode('utf-8', 'surrogateescape')

    result = []
    # Add later pages of the forum listing
    if url[forumUrlPrefixLength:].isdigit():
        result.extend(_later_listing_pages(url, body))

    # Find threads and their page count; write them to fpThreadList
    _record_threads(url, body)
    return result
# Install get_urls as the hook's get_urls callback.
# NOTE(review): `wpull_hook` is not defined in this file; presumably wpull
# injects it into the script's namespace when loading it via --python-script
# — confirm.
wpull_hook.callbacks.get_urls = get_urls
import wpull.database.sqltable
# From ludios's grab-site, libgrabsite/plugin.py, commit 7a63a3dc
class NoFsyncSQLTable(wpull.database.sqltable.SQLiteURLTable):
    """SQLiteURLTable variant that also issues ``PRAGMA synchronous=OFF``."""

    @classmethod
    def _apply_pragmas_callback(cls, connection, record):
        """Run the parent's pragma callback, then turn off SQLite's synchronous mode.

        Both arguments are passed through unchanged to the parent
        implementation; the extra pragma is executed on *connection*.
        """
        # Let the base class apply its own pragmas first.
        super()._apply_pragmas_callback(connection, record)

        print('Applying PRAGMA')

        # With synchronous=OFF SQLite does not wait for data to reach the disk.
        connection.execute('PRAGMA synchronous=OFF')
# Make the factory use NoFsyncSQLTable for its 'URLTableImplementation' entry.
# NOTE(review): `wpull_plugin` is not defined in this file; presumably wpull
# injects it when loading the script via --plugin-script — confirm.
wpull_plugin.factory.class_map['URLTableImplementation'] = NoFsyncSQLTable
#!/bin/bash
# Crawl the Roblox forum index pages with wpull, loading hook.py and plugin.py.
# The arguments are collected in an array so they can be grouped and
# commented; the resulting command line is the same as a single long
# invocation with these arguments in this order.

wpull_args=(
    # Start URL
    'https://forum.roblox.com/Forum/'

    # WARC output (size limit is 2 GiB = 2^31 bytes)
    --warc-append
    --warc-file roblox-forums-indices
    --warc-max-size 2147483648

    # Recursion scope
    --recursive
    --level inf
    --no-parent
    --no-robots

    # Request behaviour
    --no-check-certificate
    --user-agent 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    --concurrent 4
    --escaped-fragment
    --strip-session-id

    # Retries and timeouts
    --waitretry 600
    --tries 3
    --retry-connrefused
    --retry-dns-error
    --timeout 900
    --session-timeout 21600

    # Local files: downloaded documents are deleted again, state goes to the
    # database, messages go to the log file
    --delete-after
    --database roblox-forums-indices.db
    --no-verbose
    --output-file roblox-forums-indices.log

    # Scripts
    --python-script hook.py
    --plugin-script plugin.py

    # Only accept the forum root, forum group pages and forum listing pages
    # (with an optional PageIndex parameter)
    --accept-regex '^https://forum\.roblox\.com/Forum/(ShowForumGroup\.aspx\?ForumGroupID=\d+|ShowForum\.aspx\?ForumID=\d+(&PageIndex=\d+)?)?$'
)

TZ=UTC ~/.local/bin/wpull "${wpull_args[@]}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment