-
-
Save JustAnotherArchivist/85ef5c0e9d874791ee485fa69d08ac62 to your computer and use it in GitHub Desktop.
Roblox forum index grab
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import html5lib | |
import logging | |
import re | |
import urllib.parse | |
# --- URL prefixes (the numeric ID follows directly after the prefix) ---
forumUrlPrefix = 'https://forum.roblox.com/Forum/ShowForum.aspx?ForumID='
forumUrlPrefixLength = len(forumUrlPrefix)

postUrlPrefix = 'https://forum.roblox.com/Forum/ShowPost.aspx?PostID='
postUrlPrefixLength = len(postUrlPrefix)
# Matches what follows postUrlPrefix in a paginated thread URL: "<digits>&PageIndex=<digits>"
postUrlPagePattern = re.compile(r'^\d+&PageIndex=\d+$')

# --- Markup fragments searched for in the forum listing HTML ---
# Page ID is zero-based and used e.g. in the pager links' id attribute. Page index is one-based and used e.g. in the URLs as the PageIndex parameter.
pagerIdPrefix = 'ctl00_cphRoblox_ThreadView1_ctl00_Pager_Page'
pagerIdPrefixLength = len(pagerIdPrefix)

# Start of a pager link; the page ID follows directly after it
pagerLinkSearchString = '<a id="ctl00_cphRoblox_ThreadView1_ctl00_Pager_Page'
pagerLinkSearchStringLength = len(pagerLinkSearchString)

# Start of a thread subject link; the thread's post ID follows directly after it
threadHrefSearchString = '<a class="post-list-subject" href="/Forum/ShowPost.aspx?PostID='
threadHrefSearchStringLength = len(threadHrefSearchString)

# --- Runtime state shared by the hook ---
_logger = logging.getLogger('wpull.plugin.roblox-forums-indices')

# Opened once at import time in write mode (an existing threads.txt is truncated);
# the hook appends one "<thread ID> <max page index>" line per thread it finds.
fpThreadList = open('threads.txt', 'w')
# https://stackoverflow.com/a/4665027
def find_all(a_str, sub):
    """Yield the start index of each non-overlapping occurrence of ``sub`` in ``a_str``.

    Occurrences are reported left to right; after a hit the search resumes
    behind the matched text, so ``find_all('aaaa', 'aa')`` yields 0 and 2.

    An empty ``sub`` matches at every position from 0 to ``len(a_str)``
    inclusive (the same positions ``str.count`` counts) instead of yielding
    index 0 forever.
    """
    # Advance by at least one character per hit; with a step of zero (empty
    # needle) the search would restart at the same index and never terminate.
    step = max(len(sub), 1)
    start = a_str.find(sub)
    while start != -1:
        yield start
        start = a_str.find(sub, start + step)
def _text_until(text, start, terminator):
    """Return ``text[start:end]`` where ``end`` is the next ``terminator`` at or after ``start``.

    Returns ``None`` when the terminator does not occur, so callers never
    slice with the ``-1`` that ``str.find`` reports for "not found" (which
    would silently yield everything up to the last character of ``text``).
    """
    end = text.find(terminator, start)
    return None if end == -1 else text[start:end]


def _later_forum_page_urls(url, body):
    """Return wpull URL dicts for the later pages of the forum listing at ``url``.

    The highest numeric page ID found in the pager's links decides how many
    ``&PageIndex=`` URLs are generated. An empty list is returned when the
    pager cannot be located unambiguously.
    """
    pagerOffsets = list(find_all(body, '<span id="ctl00_cphRoblox_ThreadView1_ctl00_Pager"><table'))
    if len(pagerOffsets) != 1:
        _logger.warning('Roblox hook ‘{}’: searching for the pager did not yield exactly one hit. Ignoring.'.format(url))
        return []

    pager = _text_until(body, pagerOffsets[0], '</table></span>')
    if pager is None:
        _logger.warning('Roblox hook ‘{}’: the pager is not terminated. Ignoring.'.format(url))
        return []

    maxPageId = 0
    for linkOffset in find_all(pager, pagerLinkSearchString):
        # The page ID is the remainder of the link's id attribute, up to its closing quote
        page = _text_until(pager, linkOffset + pagerLinkSearchStringLength, '"')
        if page is not None and page.isdigit():
            maxPageId = max(maxPageId, int(page))

    # NOTE(review): the module comment says page IDs are zero-based and page indices
    # one-based, which would make the last page index maxPageId + 1. The upper bound
    # is kept as originally written; confirm against real pager markup.
    return [{'url': url + '&PageIndex=' + str(pageIndex)} for pageIndex in range(2, maxPageId + 1)]


def _write_thread_page_counts(url, body):
    """Write one ``<thread ID> <max page index>`` line to fpThreadList per thread row in ``body``.

    The max page index is 0 for rows that carry no numeric page links. Rows
    whose markup cannot be parsed are logged and skipped.
    """
    for threadRowOffset in find_all(body, '<tr class="forum-table-row">'):
        threadRow = _text_until(body, threadRowOffset, '</tr>')
        if threadRow is None:
            _logger.warning('Roblox hook ‘{}’ offset {}: the thread row is not terminated. Ignoring.'.format(url, threadRowOffset))
            continue

        threadHrefOffsets = list(find_all(threadRow, threadHrefSearchString))
        if len(threadHrefOffsets) != 1:
            _logger.warning('Roblox hook ‘{}’ offset {}: searching for the thread link did not yield exactly one hit. Ignoring.'.format(url, threadRowOffset))
            continue

        threadId = _text_until(threadRow, threadHrefOffsets[0] + threadHrefSearchStringLength, '"')
        if threadId is None:
            _logger.warning('Roblox hook ‘{}’ offset {}: the thread link is not terminated. Ignoring.'.format(url, threadRowOffset))
            continue

        # Links to the individual pages of this thread, if the row has any
        threadPageLinkSearchString = '<a class="linkSmall" href="/Forum/ShowPost.aspx?PostID=' + threadId + '&PageIndex='
        threadPageLinkSearchStringLength = len(threadPageLinkSearchString)
        maxPageIndex = 0
        for threadPageLinkOffset in find_all(threadRow, threadPageLinkSearchString):
            pageIndex = _text_until(threadRow, threadPageLinkOffset + threadPageLinkSearchStringLength, '"')
            if pageIndex is None or not pageIndex.isdigit():
                _logger.warning('Roblox hook ‘{}’ offsets {}/{}: page index is not numeric. Ignoring.'.format(url, threadRowOffset, threadPageLinkOffset))
            else:
                maxPageIndex = max(maxPageIndex, int(pageIndex))

        fpThreadList.write(threadId + ' ' + str(maxPageIndex) + '\n')


def get_urls(filename, urlInfo, documentInfo):
    """wpull ``get_urls`` hook: queue later forum listing pages and record threads.

    For URLs that start with forumUrlPrefix, the downloaded document is read
    from ``documentInfo['filename']`` and
      * if the URL is a bare forum URL (only digits after the prefix), the
        later listing pages found via the pager are returned for queueing;
      * every thread row on the page is written to fpThreadList together
        with the highest page index linked from that row.

    Parameters:
      filename      Unused; the document path is taken from ``documentInfo``.
      urlInfo       Mapping; only ``urlInfo['url']`` is read.
      documentInfo  Mapping; only ``documentInfo['filename']`` is read.

    Returns a list of ``{'url': ...}`` dicts (empty for all other URLs).
    """
    url = urlInfo['url']
    if not url.startswith(forumUrlPrefix):
        return []

    # Undecodable bytes are preserved as lone surrogates instead of raising
    with open(documentInfo['filename'], 'rb') as fp:
        body = fp.read().decode('utf-8', 'surrogateescape')

    result = []

    # Add later pages of the forum listing
    if url[forumUrlPrefixLength:].isdigit():
        result.extend(_later_forum_page_urls(url, body))

    # Find threads and their page count; write them to fpThreadList
    _write_thread_page_counts(url, body)

    return result


wpull_hook.callbacks.get_urls = get_urls
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import wpull.database.sqltable | |
# From ludios's grab-site, libgrabsite/plugin.py, commit 7a63a3dc
class NoFsyncSQLTable(wpull.database.sqltable.SQLiteURLTable):
    """URL table that additionally issues ``PRAGMA synchronous=OFF`` on its connections."""

    @classmethod
    def _apply_pragmas_callback(cls, connection, record):
        """Run the parent's pragma callback, then turn SQLite's ``synchronous`` setting off.

        Both arguments are passed to the parent implementation unchanged;
        ``connection`` is then used to execute the extra pragma.
        """
        # Parent pragmas first, so the override below is applied last
        super()._apply_pragmas_callback(connection, record)
        print('Applying PRAGMA')
        pragma = 'PRAGMA synchronous=OFF'
        connection.execute(pragma)


# Register the subclass under the 'URLTableImplementation' key of the plugin factory's class map
wpull_plugin.factory.class_map['URLTableImplementation'] = NoFsyncSQLTable
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Crawl the Roblox forum index pages with wpull, using hook.py and plugin.py.
# The arguments are collected in an array so they can be grouped and commented;
# wpull receives exactly the same argument list, in the same order.

wpull_args=(
  # Start URL
  'https://forum.roblox.com/Forum/'

  # WARC output
  --warc-append --warc-file roblox-forums-indices --warc-max-size 2147483648

  # Recursion scope
  --recursive --level inf
  --no-parent
  --no-robots

  # HTTP behaviour
  --no-check-certificate
  --user-agent 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
  --concurrent 4
  --waitretry 600
  --escaped-fragment
  --strip-session-id

  # Retries and timeouts
  --tries 3
  --retry-connrefused
  --retry-dns-error
  --timeout 900
  --session-timeout 21600

  # Local files: downloads are deleted, state and log are kept
  --delete-after
  --database roblox-forums-indices.db
  --no-verbose
  --output-file roblox-forums-indices.log

  # Scripts
  --python-script hook.py
  --plugin-script plugin.py

  # Only the forum root, forum groups, and forum listings (optionally paginated) are accepted
  --accept-regex '^https://forum\.roblox\.com/Forum/(ShowForumGroup\.aspx\?ForumGroupID=\d+|ShowForum\.aspx\?ForumID=\d+(&PageIndex=\d+)?)?$'
)

TZ=UTC ~/.local/bin/wpull "${wpull_args[@]}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment