Last active
March 9, 2022 00:55
-
-
Save sfan5/4642453 to your computer and use it in GitHub Desktop.
Crawls the Minetest forum mod releases section
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import urllib
import urllib2
from xml.dom import minidom
# Made by sfan5 2013 v0.4.09.3
V = "0.4.09.3"  # script version, written into the headers of results.txt and stats.txt
# This script runs through the 'Mod Releases' section and checks the topics
# for missing things (title prefix, license, dependencies, download link).
# It also displays statistics.
# It can also reply to topics when valid session cookies are provided.
# It can also move faulty topics to another forum.

## <Settings>
pages_total = 16  # number of forum pages to crawl (pages 1..pages_total)
post_to_mod_topics = False  # reply to every faulty topic using the bot account
move_faulty_topics = False  # move every faulty topic to `to_forum` using the moderator account
crawl_section = "11"  # section to crawl
to_forum = "9"  # destination forum section id when moving
base_addr = "https://forum.minetest.net/"
hdrs_bot = {'Cookie': 'PHPSESSID=EXAMPLE; forum_cookie_31d6ec=EXAMPLE'}  # bot account session cookies for posting posts
hdrs_mod = {'Cookie': 'PHPSESSID=EXAMPLE; forum_cookie_31d6ec=EXAMPLE'}  # moderator account session cookies for moving topics
## </Settings>
# Global statistics accumulator: filled in while crawling, reported at the end.
stats = {
    # Mistake count per forum page. Index 0 is an unused placeholder so that
    # the (1-based) page number can be used directly as the list index.
    "errors_on_forumpage": [0] * (pages_total + 1),
    "checked_mods": 0,        # topics that were fully checked
    "mods_with_errs": 0,      # checked topics with at least one mistake
    "total_errs": 0,          # sum of all mistakes found
    "errcnt_topicformat": 0,  # title lacks the '[Mod]' / '[Modpack]' prefix
    "errcnt_license": 0,      # no license mentioned
    "errcnt_depends": 0,      # no dependencies mentioned
    "errcnt_download": 0,     # no link found
    "user_mods": {},          # user name -> number of checked topics
    "user_mistakes": {},      # user name -> number of mistakes
}
def inc_user_mistakes(username):
    """Add one to the mistake counter of *username*.

    The counter lives in stats["user_mistakes"] and is created with the
    value 1 the first time a user is seen.
    """
    per_user = stats["user_mistakes"]
    # dict.get replaces the former bare `except:`, which would also have
    # swallowed unrelated errors such as KeyboardInterrupt.
    per_user[username] = per_user.get(username, 0) + 1
def inc_user_mods(username):
    """Add one to the checked-topic counter of *username*.

    The counter lives in stats["user_mods"] and is created with the value 1
    the first time a user is seen.
    """
    per_user = stats["user_mods"]
    # dict.get replaces the former bare `except:`, which would also have
    # swallowed unrelated errors such as KeyboardInterrupt.
    per_user[username] = per_user.get(username, 0) + 1
def get_user_mistakes(username):
    """Return the number of mistakes recorded for *username*, 0 if there are none."""
    return stats["user_mistakes"].get(username, 0)
def shift_fwd1(ar, upos):
    """Shift the entries of *ar* from index *upos* on one slot towards the end.

    Works in place and returns nothing. The last entry is dropped and
    ar[upos] keeps its old value, so the caller is expected to overwrite it
    with the newly inserted item.

    ar   -- list to modify
    upos -- index at which the caller wants to insert
    """
    # Walk backwards so every value is copied before it is overwritten.
    for i in range(len(ar) - 1, upos, -1):
        if i == 1:
            # Never copy slot 0 into slot 1 (make_top keeps a sentinel in slot 0).
            continue
        ar[i] = ar[i - 1]
def mk_tuple_list(ln, v):
    """Return a new list holding *ln* placeholder tuples of the form ("", v)."""
    return [("", v) for _ in range(ln)]
def make_top(cn, d, dfunc, cmptype, sfmt, prntfunc):
    """Print a ranking of at most *cn* entries of *d*.

    cn       -- maximum number of places to print
    d        -- iterable of entries (iterating a dict yields its keys)
    dfunc    -- called with one entry, returns the number it is ranked by
    cmptype  -- "most" puts the highest values first, "least" the lowest;
                with any other value nothing is printed
    sfmt     -- %-format string that receives (place, entry, value)
    prntfunc -- called once per place with the formatted line

    Entries with equal values keep the order in which *d* yields them.
    When *d* has fewer than *cn* entries only the existing ones are printed;
    no empty filler places are emitted. Returns nothing.
    """
    if cmptype not in ("most", "least"):
        return
    scored = [(entry, dfunc(entry)) for entry in d]
    # sorted() is stable (also with reverse=True), so ties stay in input order.
    scored = sorted(scored, key=lambda pair: pair[1], reverse=(cmptype == "most"))
    for place, (entry, value) in enumerate(scored[:max(cn, 0)], 1):
        prntfunc(sfmt % (place, entry, value))
# Patterns that pull the hidden form fields out of the forum's HTML forms.
# Their captured values are sent back when replying to or moving a topic.
rform_user = re.compile(r'<input type="hidden" name="form_user" value="([^"]+)" />')
rcsrf_token = re.compile(r'<input type="hidden" name="csrf_token" value="([a-f0-9]+)" />')
rform_honey_key_id = re.compile(r'<input type="hidden" name="form_honey_key_id" value="([a-f0-9]*)" />')
# Captures the *name* of the anti-spam text input; the reply submits it with an empty value.
rantispam_field = re.compile(r'<input type="text" id="fld1" name="([a-z0-9_]+)" size="[0-9]*" autocomplete="off" />')
# Report file; it stays open for the whole run and receives every oprint() line.
fl = open("results.txt", 'w')
fl.write("%%%%%% Generated by mtforum-crawler.py v%s\n\n" % V)
def oprint(st):
    """Write *st* plus a newline to the results file and echo it to stdout.

    Unicode text is encoded as UTF-8 before it is written to the file; any
    other value is written unchanged. The file is flushed after every line.
    """
    if isinstance(st, type(u'')):  # Check if unicode
        fst = st.encode('utf8')
    else:
        fst = st
    fl.write(fst + "\n")
    fl.flush()
    print(st)
def _read_url(opener, target):
    """Open *target* with *opener* and return the complete response body.

    The handle is closed even when reading fails, so no connection is leaked.
    """
    handle = opener(target)
    try:
        return handle.read()
    finally:
        handle.close()


def _note_mistake(found, message, counter, author):
    """Record one mistake: keep *message* and bump the matching counters."""
    found.append(message)
    stats[counter] += 1
    stats["total_errs"] += 1
    inc_user_mistakes(author)


# A first post mentioning any of these words counts as listing its dependencies.
DEPENDENCY_WORDS = ("depends", "dependencies", "dependency", "dependence", "require")

# Crawl every forum page, check each non-sticky topic and collect statistics.
for i in range(1, pages_total + 1):
    oprint("##### Page " + str(i) + " #####")
    page_html = _read_url(urllib.urlopen, base_addr + "viewforum.php?id=" + crawl_section + "&p=" + str(i))
    # The topic list sits between these two markers.
    # NOTE(review): assumes the container id is "forum" + section id; this
    # yields the previously hard-coded "forum11" for the default section.
    list_start = '<div id="forum' + crawl_section + '" class="main-content main-forum forum-views">'
    list_end = '<div class="main-foot">'
    if list_start not in page_html or list_end not in page_html:
        oprint("Invalid Data for Page %i!" % i)
        continue
    page_html = page_html.split(list_start, 1).pop()
    page_html = page_html.split(list_end, 1)[0]
    page_html = "<div>" + page_html  # Root Tag needed
    dom = minidom.parseString(page_html)
    l = dom.firstChild.getElementsByTagName("div")
    for k in range(0, len(l), 2):  # only every second <div> is looked at
        row = l[k].childNodes[3]
        title_cell = row.childNodes[1]
        if 'Sticky' in title_cell.toxml():
            continue  # Ignore Sticky Topics
        topic_link = title_cell.childNodes[-1]
        topic_name = topic_link.firstChild.data
        topic_url = topic_link.getAttribute("href")
        made_by = row.childNodes[3].childNodes[0].childNodes[-1].firstChild.data

        found = []  # mistake messages for this topic
        if not topic_name.lower().startswith(("[mod]", "[modpack]")):
            _note_mistake(found, "[EE] Topic doesn't begin with '[Mod]' or '[Modpack]'\n", "errcnt_topicformat", made_by)

        # Cut the first post (up to the first signature) out of the topic page.
        topic_html = _read_url(urllib.urlopen, topic_url)
        post_start = '<div class="entry-content">'
        post_end = '<div class="sig-content">'
        if post_start not in topic_html or post_end not in topic_html:
            oprint("Invalid Data for Topic ''%s''!" % topic_url)
            continue
        b = topic_html.split(post_start, 1).pop()
        b = b.split(post_end, 1)[0]
        b = "<div>" + b + "</div>"
        post_lower = b.lower()

        if "license" not in post_lower:
            _note_mistake(found, "[EE] No License found\n", "errcnt_license", made_by)
        if not any(word in post_lower for word in DEPENDENCY_WORDS):
            _note_mistake(found, "[EE] No Dependencies found\n", "errcnt_depends", made_by)
        if '<a href="' not in post_lower:  # any link counts as a download
            _note_mistake(found, "[EE] No Download found\n", "errcnt_download", made_by)

        if found:
            results = "".join(found)
            oprint("*** Page %i | '%s' by %s %s ***" % (i, topic_name, made_by, topic_url))
            oprint(results)
            if post_to_mod_topics:
                # Load the reply form first to obtain its hidden fields, then submit it.
                post_url = topic_url.replace("viewtopic.php?id=", "post.php?tid=")
                cont = _read_url(urllib2.urlopen, urllib2.Request(post_url, headers=hdrs_bot))
                formdata = urllib.urlencode({
                    'req_message': results + "\n\nPlease fix these Mistakes and report this post, a moderator will delete it\nIf you believe I have made a Mistake contact [url=http://forum.minetest.net/profile.php?id=239]sfan5[/url]",
                    'form_sent': '1',
                    'form_user': rform_user.search(cont).group(1),
                    'pun_poll_block_open': '0',
                    'csrf_token': rcsrf_token.search(cont).group(1),
                    'form_honey_key_id': rform_honey_key_id.search(cont).group(1),
                    rantispam_field.search(cont).group(1): '',
                })
                _read_url(urllib2.urlopen, urllib2.Request(url=post_url, data=formdata, headers=hdrs_bot))
            if move_faulty_topics:
                # Load the move form for its CSRF token, then submit the move.
                move_url = topic_url.replace("viewtopic.php?id=", "moderate.php?fid=" + crawl_section + "&move_topics=")
                cont = _read_url(urllib2.urlopen, urllib2.Request(move_url, headers=hdrs_mod))
                formdata = urllib.urlencode({
                    'with_redirect': '0',
                    'csrf_token': rcsrf_token.search(cont).group(1),
                    'move_to_forum': to_forum,
                    'topics': topic_url.split('?')[1].replace("id=", ''),
                })
                # Submit with the moderator session that fetched the token;
                # this request used to be sent with the bot cookies.
                _read_url(urllib2.urlopen, urllib2.Request(url=base_addr + "moderate.php?fid=" + crawl_section, data=formdata, headers=hdrs_mod))
            stats["mods_with_errs"] += 1
            stats["errors_on_forumpage"][i] += 1
        stats["checked_mods"] += 1
        inc_user_mods(made_by)
def _ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 if the denominator is 0."""
    if not denominator:
        return 0.0
    return float(numerator) / denominator


# Print Stats
oprint("#### Statistics ####")
oprint("\n") # Dbl. Sep.
oprint("Mistakes per Forum Page:")
pg_with_most_errs = -1  # stays -1 when no page had any mistake
pg_with_most_errs_s = 0
for i in range(1, len(stats["errors_on_forumpage"])):  # index 0 is a placeholder
    page_errs = stats["errors_on_forumpage"][i]
    oprint(" " + str(i) + " => " + str(page_errs))
    if page_errs > pg_with_most_errs_s:
        pg_with_most_errs_s = page_errs
        pg_with_most_errs = i
oprint("Page with most Errors is Page %i with %i Errors" % (pg_with_most_errs, pg_with_most_errs_s))
oprint("") # Sep.
oprint("Checked Topics: %i" % stats["checked_mods"])
oprint(" -> with Mistakes: %i" % stats["mods_with_errs"])
oprint(" -> without Mistakes: %i" % (stats["checked_mods"] - stats["mods_with_errs"]))
oprint("") # Sep.
oprint("Total Mistakes: %i" % stats["total_errs"])
# _ratio keeps the report alive when nothing was checked or nothing was wrong.
oprint(" -> avg. Mistakes per Topic: %s" % str(_ratio(stats["total_errs"], stats["checked_mods"])))
oprint(" -> avg. Mistakes per Topic with Mistake(s): %s" % str(_ratio(stats["total_errs"], stats["mods_with_errs"])))
oprint(" -> avg. Mistakes per User: %s" % str(_ratio(stats["total_errs"], len(stats["user_mods"]))))
oprint("") # Sep.
oprint("Mistake Count:")
oprint(" wrong Topictitle => %i" % stats["errcnt_topicformat"])
oprint(" License missing => %i" % stats["errcnt_license"])
oprint(" Dependencies missing => %i" % stats["errcnt_depends"])
oprint(" Download missing => %i" % stats["errcnt_download"])
oprint("") # Sep.
oprint("Top 15 of Mistakes/User:")
make_top(15, stats["user_mistakes"], get_user_mistakes, "most", " %i. %s %i Mistakes", oprint)
oprint("") # Sep.
oprint("Top 7 least mistakes/User:")
make_top(7, stats["user_mods"], get_user_mistakes, "least", " %i. %s %i Mistakes", oprint)
oprint("") # Sep.
oprint("Top 7 least Mistakes/User without 0-Mistake-Users:")
make_top(7, stats["user_mistakes"], get_user_mistakes, "least", " %i. %s %i Mistakes", oprint)
oprint("") # Sep.
oprint("Top 10 of best Mistake/Mods quota:")
# Mistakes per checked topic for every user with at least one checked topic.
quota_tbl = {}
for u in stats["user_mods"]:
    quota_tbl[u] = _ratio(get_user_mistakes(u), stats["user_mods"][u])
make_top(10, quota_tbl, quota_tbl.__getitem__, "least", " %i. %s %0.5f", oprint)
oprint("") # Sep.
oprint("Top 10 of best Mistake/Mods quota without 0-Mistake-Users:")
# Same quota, restricted to users with at least one mistake.
quota_tbl2 = {}
for u in stats["user_mistakes"]:
    if u not in stats["user_mods"]:
        # A mistake can be recorded for a topic that is later skipped as
        # invalid, leaving the user without any checked topic.
        continue
    quota_tbl2[u] = _ratio(get_user_mistakes(u), stats["user_mods"][u])
make_top(10, quota_tbl2, quota_tbl2.__getitem__, "least", " %i. %s %0.5f", oprint)
oprint("") # Sep.
oprint("Top 20 of Mods/User:")
make_top(20, stats["user_mods"], stats["user_mods"].__getitem__, "most", " %i. %s %i Mods", oprint)
oprint("") # Sep.
fl.close()
def _dump_dict(out, name, table):
    """Write *table* to *out* as Python statements that rebuild it under *name*.

    Keys are written with repr() so that quotes, backslashes and non-ASCII
    characters in user names still produce a valid, ASCII-only literal.
    """
    out.write(name + ' = {}\n')
    for key in table:
        out.write(name + '[' + repr(key) + '] = ' + str(table[key]) + '\n')


# Dump raw Statistics
print("Saving raw Statistics....")
# `with` closes the file even if one of the writes fails.
with open("stats.txt", 'w') as f:
    f.write("# Generated by mtforum-crawler.py v%s\n\n" % V)
    f.write("stats = {}\n")
    #stats["errors_on_forumpage"] (list)
    f.write('stats["errors_on_forumpage"] = []\n')
    f.write('stats["errors_on_forumpage"].append(-1)\n')  # placeholder for index 0
    for i in range(1, len(stats["errors_on_forumpage"])):
        f.write('stats["errors_on_forumpage"].append(' + str(stats["errors_on_forumpage"][i]) + ') # Page ' + str(i) + '\n')
    # plain integer counters, written in a fixed order
    for key in ("checked_mods", "mods_with_errs", "total_errs",
                "errcnt_topicformat", "errcnt_license",
                "errcnt_depends", "errcnt_download"):
        f.write('stats["' + key + '"] = ' + str(stats[key]) + '\n')
    #stats["user_mods"] (dict)
    _dump_dict(f, 'stats["user_mods"]', stats["user_mods"])
    #stats["user_mistakes"] (dict)
    _dump_dict(f, 'stats["user_mistakes"]', stats["user_mistakes"])
    #quota_tbl (dict)
    _dump_dict(f, 'quota_tbl', quota_tbl)
    #quota_tbl2 (dict)
    _dump_dict(f, 'quota_tbl2', quota_tbl2)
print("done.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment