Skip to content

Instantly share code, notes, and snippets.

@sfan5 sfan5/mtforum-crawler.py
Last active Dec 11, 2015

Embed
What would you like to do?
Crawls the Minetest forum mod releases section
import urllib, urllib2, re
from xml.dom import minidom
# Made by sfan5 2013 v0.4.09.3
# Version string; it is written into the header line of results.txt and stats.txt.
V = "0.4.09.3"
# This script walks through the 'Mod Releases' section and checks every topic for
# missing things (title prefix, license, dependencies, download link).
# It also prints statistics about the mistakes it found.
# It can also reply to faulty topics when valid session cookies are provided.
# It can also move faulty topics to another forum section.
## <Settings>
# Number of forum index pages to crawl (pages 1 .. pages_total).
pages_total = 16
# When True, a reply listing the detected mistakes is posted to every faulty topic.
post_to_mod_topics = False
# When True, every faulty topic is moved to the forum section given by to_forum.
move_faulty_topics = False
crawl_section = "11" # id of the forum section to crawl
to_forum = "9" # destination forum section id when moving
# Prefix for the forum URLs that the script builds itself.
base_addr = "https://forum.minetest.net/"
hdrs_bot = {'Cookie': 'PHPSESSID=EXAMPLE; forum_cookie_31d6ec=EXAMPLE'} # bot account session cookies for posting posts
hdrs_mod = {'Cookie': 'PHPSESSID=EXAMPLE; forum_cookie_31d6ec=EXAMPLE'} # moderator account session cookies for moving topics
## </Settings>
# Running totals collected while crawling; they are printed and dumped once
# the crawl has finished.
stats = {
    # Number of faulty topics per forum index page.  Index 0 is never used so
    # that the page number (1 .. pages_total) can serve as the list index.
    "errors_on_forumpage": [0] * (pages_total + 1),
    "checked_mods": 0,        # topics that were checked
    "mods_with_errs": 0,      # checked topics with at least one mistake
    "total_errs": 0,          # mistakes over all topics
    "errcnt_topicformat": 0,  # title lacks the '[Mod]' / '[Modpack]' prefix
    "errcnt_license": 0,      # no license mentioned
    "errcnt_depends": 0,      # no dependencies mentioned
    "errcnt_download": 0,     # no link in the first post
    "user_mods": {},          # user name -> number of checked topics
    "user_mistakes": {},      # user name -> number of mistakes
}
def inc_user_mistakes(username):
    """Add one to the mistake counter of *username*.

    A user without a counter yet starts at 1.  The counters live in
    ``stats["user_mistakes"]``.
    """
    mistakes = stats["user_mistakes"]
    # dict.get covers the "first mistake" case without a catch-all except
    # that would also hide unrelated errors.
    mistakes[username] = mistakes.get(username, 0) + 1
def inc_user_mods(username):
    """Add one to the checked-topic counter of *username*.

    A user without a counter yet starts at 1.  The counters live in
    ``stats["user_mods"]``.
    """
    mods = stats["user_mods"]
    # dict.get covers the "first topic" case without a catch-all except
    # that would also hide unrelated errors.
    mods[username] = mods.get(username, 0) + 1
def get_user_mistakes(username):
    """Return the number of mistakes recorded for *username*.

    Users that have no entry in ``stats["user_mistakes"]`` yield 0.
    """
    return stats["user_mistakes"].get(username, 0)
def shift_fwd1(ar, upos):
    """Shift the elements of *ar* from index *upos* on one slot to the right.

    The list is modified in place and keeps its length, so the last element
    is dropped.  ``ar[upos]`` itself is left untouched (it then appears twice)
    and is meant to be overwritten by the caller.  Index 1 is never written,
    which means ``ar[0]`` is never copied into ``ar[1]``.

    Returns None.
    """
    # Walk from the end towards upos so that no value is overwritten before
    # it has been copied.
    for i in range(len(ar) - 1, upos, -1):
        if i != 1:
            ar[i] = ar[i - 1]
def mk_tuple_list(ln, v):
    """Return a new list of *ln* tuples ``("", v)``.

    A non-positive *ln* gives an empty list.
    """
    return [("", v) for _ in range(0, ln)]
def make_top(cn, d, dfunc, cmptype, sfmt, prntfunc):
    """Print a ranking of the *cn* best entries of *d*.

    cn       -- number of ranks to print
    d        -- iterable of entries (iterating a dict ranks its keys)
    dfunc    -- called once per entry; returns the value to rank by
    cmptype  -- "most" puts the highest values first, "least" the lowest;
                any other value makes the function return without output
    sfmt     -- %-format string that receives (rank, entry, value)
    prntfunc -- called with every formatted line

    Entries with equal values keep the order in which *d* yields them.
    Only values above 0.0 ("most") or below 9999.9 ("least") are ranked.
    When fewer than *cn* entries qualify, the remaining ranks are still
    printed, with an empty name and that limit value as placeholder.
    """
    if cmptype == "most":
        limit, descending = 0.0, True
    elif cmptype == "least":
        limit, descending = 9999.9, False
    else:
        return

    ranked = []
    for e in d:
        da = dfunc(e)
        qualifies = (da > limit) if descending else (da < limit)
        if qualifies:
            ranked.append((e, da))
    # list.sort is stable, also with reverse=True, so ties stay in the order
    # in which they were seen.
    ranked.sort(key=lambda pair: pair[1], reverse=descending)

    rows = ranked[:max(cn, 0)]
    rows.extend([("", limit)] * (cn - len(rows)))  # pad the unused ranks
    for k, (entry, value) in enumerate(rows, 1):
        prntfunc(sfmt % (k, entry, value))
# Patterns that pull the values of the form fields out of the HTML of the
# forum's reply and moderation forms; they have to be sent back on submit.
_HIDDEN_FIELD = r'<input type="hidden" name="%s" value="(%s)" />'
rform_user = re.compile(_HIDDEN_FIELD % ("form_user", r'[^"]+'))
rcsrf_token = re.compile(_HIDDEN_FIELD % ("csrf_token", r'[a-f0-9]+'))
rform_honey_key_id = re.compile(_HIDDEN_FIELD % ("form_honey_key_id", r'[a-f0-9]*'))
# This one captures the *name* of the text field, not its value.
rantispam_field = re.compile(
    r'<input type="text" id="fld1" name="([a-z0-9_]+)"'
    r' size="[0-9]*" autocomplete="off" />'
)
# Report file: every line passed to oprint() is written to it.  It starts
# with a header line that names the script version.
fl = open("results.txt", "w")
_results_header = "%%%%%% Generated by mtforum-crawler.py v%s\n\n" % (V,)
fl.write(_results_header)
def oprint(st):
    """Write *st* as one line to the results file and echo it to stdout.

    Unicode text is encoded as UTF-8 before it is written to the file; any
    other value is written as it is.  The file is flushed after every line.
    """
    if isinstance(st, type(u'')): # Check if unicode
        fst = st.encode('utf8')
    else:
        fst = st
    fl.write(fst + "\n")
    fl.flush()
    print(st)
# Crawl every index page of the section, check the first post of each
# non-sticky topic and record what is missing.
def _http_read(url, data=None, headers=None):
    """Return the body served for *url* and close the connection.

    Without *headers* the page is fetched with ``urllib.urlopen``.  With
    *headers* the request goes through ``urllib2`` so that the session
    cookies are sent; passing *data* as well turns it into a POST.
    """
    if headers is None:
        handle = urllib.urlopen(url)
    else:
        handle = urllib2.urlopen(urllib2.Request(url, data=data, headers=headers))
    try:
        return handle.read()
    finally:
        handle.close()


def _cut_between(text, start_marker, end_marker):
    """Return what follows *start_marker* up to *end_marker* in *text*.

    Returns None when one of the two markers does not occur in *text*.
    """
    if start_marker not in text or end_marker not in text:
        return None
    return text.split(start_marker, 1).pop().split(end_marker, 1)[0]


for i in range(1, pages_total + 1):
    oprint("##### Page " + str(i) + " #####")
    page_html = _http_read(base_addr + "viewforum.php?id=" + crawl_section + "&p=" + str(i))
    # The marker follows crawl_section instead of a hard-coded "forum11".
    listing = _cut_between(
        page_html,
        '<div id="forum' + crawl_section + '" class="main-content main-forum forum-views">',
        '<div class="main-foot">')
    if listing is None:
        oprint("Invalid Data for Page %i!" % i)
        continue

    dom = minidom.parseString("<div>" + listing) # Root Tag needed
    rows = dom.firstChild.getElementsByTagName("div")
    # Only every second <div> is looked at -- presumably the ones in between
    # belong to the same topic row; verify against the forum markup.
    for k in range(0, len(rows), 2):
        title_cell = rows[k].childNodes[3].childNodes[1]
        if 'Sticky' in title_cell.toxml():
            continue # Ignore Sticky Topics
        topic_link = title_cell.childNodes[-1]
        topic_name = topic_link.firstChild.data
        topic_url = topic_link.getAttribute("href")
        made_by = rows[k].childNodes[3].childNodes[3].childNodes[0].childNodes[-1].firstChild.data

        first_post = _cut_between(
            _http_read(topic_url),
            '<div class="entry-content">',
            '<div class="sig-content">')
        if first_post is None:
            # Nothing has been counted for this topic yet, so skipping it
            # keeps user_mistakes and user_mods consistent with each other.
            oprint("Invalid Data for Topic ''%s''!" % topic_url)
            continue
        post_lower = first_post.lower()

        # (stats counter, report line) for every check the topic fails
        problems = []
        if not topic_name.lower().startswith(("[mod]", "[modpack]")):
            problems.append(("errcnt_topicformat", "[EE] Topic doesn't begin with '[Mod]' or '[Modpack]'"))
        if "license" not in post_lower:
            problems.append(("errcnt_license", "[EE] No License found"))
        if not any(word in post_lower for word in
                   ("depends", "dependencies", "dependency", "dependence", "require")):
            problems.append(("errcnt_depends", "[EE] No Dependencies found"))
        if '<a href="' not in post_lower:
            problems.append(("errcnt_download", "[EE] No Download found"))

        for counter, _line in problems:
            stats[counter] += 1
            stats["total_errs"] += 1
            inc_user_mistakes(made_by)

        if problems:
            results = "".join(line + "\n" for _counter, line in problems)
            oprint("*** Page %i | '%s' by %s %s ***" % (i, topic_name, made_by, topic_url))
            oprint(results)
            if post_to_mod_topics:
                # Load the reply form first: its hidden fields must be sent
                # back together with the message.
                reply_url = topic_url.replace("viewtopic.php?id=", "post.php?tid=")
                cont = _http_read(reply_url, headers=hdrs_bot)
                formdata = urllib.urlencode({
                    'req_message': results + "\n\nPlease fix these Mistakes and report this post, a moderator will delete it\nIf you believe I have made a Mistake contact [url=http://forum.minetest.net/profile.php?id=239]sfan5[/url]",
                    'form_sent': '1',
                    'form_user': rform_user.search(cont).group(1),
                    'pun_poll_block_open': '0',
                    'csrf_token': rcsrf_token.search(cont).group(1),
                    'form_honey_key_id': rform_honey_key_id.search(cont).group(1),
                    rantispam_field.search(cont).group(1): '',
                })
                _http_read(reply_url, data=formdata, headers=hdrs_bot)
            if move_faulty_topics:
                move_url = topic_url.replace("viewtopic.php?id=", "moderate.php?fid=" + crawl_section + "&move_topics=")
                cont = _http_read(move_url, headers=hdrs_mod)
                formdata = urllib.urlencode({
                    'with_redirect': '0',
                    'csrf_token': rcsrf_token.search(cont).group(1),
                    'move_to_forum': to_forum,
                    'topics': topic_url.split('?')[1].replace("id=", ''),
                })
                # The move itself needs the moderator session as well; the
                # csrf token above was fetched with the moderator cookies.
                _http_read(base_addr + "moderate.php?fid=" + crawl_section, data=formdata, headers=hdrs_mod)
            stats["mods_with_errs"] += 1
            stats["errors_on_forumpage"][i] += 1
        stats["checked_mods"] += 1
        inc_user_mods(made_by)
# Print Stats
def _ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 for a zero denominator.

    Keeps the report from crashing when no topic was checked or when no
    topic had a mistake.
    """
    if not denominator:
        return 0.0
    return numerator * 1.0 / denominator


oprint("#### Statistics ####")
oprint("\n") # Dbl. Sep.
oprint("Mistakes per Forum Page:")
pg_with_most_errs = -1 # stays -1 when no page had a faulty topic
pg_with_most_errs_s = 0
for i in range(1, len(stats["errors_on_forumpage"])): # index 0 is unused
    page_errs = stats["errors_on_forumpage"][i]
    oprint(" " + str(i) + " => " + str(page_errs))
    if page_errs > pg_with_most_errs_s:
        pg_with_most_errs_s = page_errs
        pg_with_most_errs = i
oprint("Page with most Errors is Page %i with %i Errors" % (pg_with_most_errs, pg_with_most_errs_s))
oprint("") # Sep.
oprint("Checked Topics: %i" % stats["checked_mods"])
oprint(" -> with Mistakes: %i" % stats["mods_with_errs"])
oprint(" -> without Mistakes: %i" % (stats["checked_mods"] - stats["mods_with_errs"]))
oprint("") # Sep.
oprint("Total Mistakes: %i" % stats["total_errs"])
oprint(" -> avg. Mistakes per Topic: %s" % str(_ratio(stats["total_errs"], stats["checked_mods"])))
oprint(" -> avg. Mistakes per Topic with Mistake(s): %s" % str(_ratio(stats["total_errs"], stats["mods_with_errs"])))
oprint(" -> avg. Mistakes per User: %s" % str(_ratio(stats["total_errs"], len(stats["user_mods"]))))
oprint("") # Sep.
oprint("Mistake Count:")
oprint(" wrong Topictitle => %i" % stats["errcnt_topicformat"])
oprint(" License missing => %i" % stats["errcnt_license"])
oprint(" Dependencies missing => %i" % stats["errcnt_depends"])
oprint(" Download missing => %i" % stats["errcnt_download"])
oprint("") # Sep.
oprint("Top 15 of Mistakes/User:")
make_top(15, stats["user_mistakes"], get_user_mistakes, "most", " %i. %s %i Mistakes", oprint)
oprint("") # Sep.
oprint("Top 7 least mistakes/User:")
# Ranks every user that has a checked topic, so users with 0 mistakes count.
make_top(7, stats["user_mods"], get_user_mistakes, "least", " %i. %s %i Mistakes", oprint)
oprint("") # Sep.
oprint("Top 7 least Mistakes/User without 0-Mistake-Users:")
make_top(7, stats["user_mistakes"], get_user_mistakes, "least", " %i. %s %i Mistakes", oprint)
oprint("") # Sep.
oprint("Top 10 of best Mistake/Mods quota:")
# quota = mistakes per checked topic, for every user with a checked topic
quota_tbl = {}
for u in stats["user_mods"]:
    quota_tbl[u] = _ratio(get_user_mistakes(u), stats["user_mods"][u])
make_top(10, quota_tbl, quota_tbl.get, "least", " %i. %s %0.5f", oprint)
oprint("") # Sep.
oprint("Top 10 of best Mistake/Mods quota without 0-Mistake-Users:")
# Same quota, restricted to users with at least one mistake.  A user without
# any counted topic has no quota and is left out instead of raising KeyError.
quota_tbl2 = {}
for u in stats["user_mistakes"]:
    if u in stats["user_mods"]:
        quota_tbl2[u] = _ratio(get_user_mistakes(u), stats["user_mods"][u])
make_top(10, quota_tbl2, quota_tbl2.get, "least", " %i. %s %0.5f", oprint)
oprint("") # Sep.
oprint("Top 20 of Mods/User:")
make_top(20, stats["user_mods"], stats["user_mods"].get, "most", " %i. %s %i Mods", oprint)
oprint("") # Sep.
fl.close()
# Dump raw Statistics
def _py_key(name):
    """Return *name* as a double-quoted string literal for stats.txt.

    Unicode names are encoded as UTF-8 first, as oprint() does for the
    report.  Backslashes and double quotes are escaped so that a user name
    cannot break out of the literal.
    """
    if isinstance(name, type(u'')):
        name = name.encode('utf8')
    return '"' + name.replace('\\', '\\\\').replace('"', '\\"') + '"'


print("Saving raw Statistics....")
# stats.txt repeats the collected numbers as Python-style assignments.
with open("stats.txt", 'w') as f:
    f.write("# Generated by mtforum-crawler.py v%s\n\n" % V)
    f.write("stats = {}\n")
    #stats["errors_on_forumpage"] (list)
    f.write('stats["errors_on_forumpage"] = []\n')
    f.write('stats["errors_on_forumpage"].append(-1)\n') # stands in for the unused index 0
    for i in range(1, len(stats["errors_on_forumpage"])):
        f.write('stats["errors_on_forumpage"].append(' + str(stats["errors_on_forumpage"][i]) + ') # Page ' + str(i) + '\n')
    # plain counters (int)
    for key in ("checked_mods", "mods_with_errs", "total_errs",
                "errcnt_topicformat", "errcnt_license", "errcnt_depends",
                "errcnt_download"):
        f.write('stats["' + key + '"] = ' + str(stats[key]) + '\n')
    # per-user tables (dict), each written as an empty dict plus one
    # assignment per user
    for target, table in (('stats["user_mods"]', stats["user_mods"]),
                          ('stats["user_mistakes"]', stats["user_mistakes"]),
                          ('quota_tbl', quota_tbl),
                          ('quota_tbl2', quota_tbl2)):
        f.write(target + ' = {}\n')
        for k in table:
            f.write(target + '[' + _py_key(k) + '] = ' + str(table[k]) + '\n')
print("done.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.