@dayllanmaza · Last active September 12, 2018 18:11
import os
import csv
import pprint
import re
import sys
import time
import calendar
import datetime
import requests
# Closing templates that mark a case as resolved: {{resolved}}, {{atop}}, {{archive top}}.
REGEX_CASE_RESOLVED = r'{{\s?(resolved|atop|archive\stop)'
# Captures the signature timestamp (HH:MM, DD Month YYYY) following a [[User talk:...]] link.
REGEX_USER_TALK_DATES = r'\[{2}User[\s_]talk\:(?:[^(?:\]\]|\||#)])*[^\]\]]*\]\]\)?\s(\d{2}:\d{2},\s\d{2}\s\w+\s\d{4})'
# Captures bare http(s) URLs in the wikitext.
REGEX_LINKS = r'((?:https|http):\/{2}(?:www\.)?[^]|\s]+)'
# Captures [[Special:...]] wikilink targets.
REGEX_SPECIAL_TEMPLATE = r'\[{2}(Special:[^\||\]]+)'
# Captures [[WP:...]] and [[Wikipedia:...]] wikilink targets.
REGEX_WP_LINK = r'\[{2}((?:WP|Wikipedia):\s?[^\|\]\]]*)'
# noticeboard_page = 'Wikipedia:Administrators\'_noticeboard/Archive'
noticeboard_page = 'Wikipedia:Administrators\'_noticeboard/IncidentArchive'
base_url = 'https://en.wikipedia.org/w/api.php?format=json&action=parse&page='
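# Walk a fixed range of archive pages, scrape every top-level case section,
# and write one CSV row per case.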
def main():
    path = os.path.dirname(os.path.abspath(__file__)) + '/'
    # newline='' keeps the csv module from writing blank rows on Windows.
    # with open(path + 'an_links.csv', 'w', newline='') as csvfile:
    with open(path + 'ani_links.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        headers = ['title', 'url', 'resolved', 'open date', 'diff total', 'diff links',
                   'special total', 'special links', 'other wiki total', 'other wiki', 'timeline total',
                   'timeline links', 'toollabs total', 'toollabs links', 'others total', 'others links']
        writer.writerow(headers)
        # for i in range(293, 303):  # range used for the AN archives
        for i in range(981, 992):
            cases = fetch_archive_cases(i)
            if cases:
                writer.writerows(cases)
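# Fetch the section list for one archive page and build a CSV row for each
# top-level case section it contains.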
def fetch_archive_cases(index):
    page_url = base_url + noticeboard_page + str(index) + '&prop=sections'
    data = requests.get(page_url).json()
    print(page_url)
    sections = data['parse']['sections']
    cases = []
    for section in sections:
        if section['toclevel'] != 1:  # we only care about top level sections
            continue
        if not section['index']:
            print('Could not parse: ' + section['anchor'])
            continue
        section_url = base_url + noticeboard_page + str(index) + '&prop=wikitext|externallinks|iwlinks' + '&section=' + section['index']
        data = fetch_case_data(section_url)
        wikitext = str(data['wikitext'])
        case_url = 'https://en.wikipedia.org/wiki/' + section['fromtitle'] + '#' + section['anchor']
        print(case_url)
        links = get_links(wikitext)
        case = [
            section['line'],                 # title
            case_url,                        # url
            is_case_resolved(wikitext),      # resolved
            get_case_open_date(wikitext),    # open date
            len(links['diff']),              # diff total
            '\n'.join(links['diff']),        # diff links
            len(links['special']),           # special pages count
            '\n'.join(links['special']),     # special pages links
            len(links['other_wiki']),        # other wiki links count
            '\n'.join(links['other_wiki']),  # other wiki links
            len(links['timeline']),          # timeline count
            '\n'.join(links['timeline']),    # timeline links
            len(links['toollabs']),          # toollabs count
            '\n'.join(links['toollabs']),    # toollabs links
            len(links['other']),             # other links count
            '\n'.join(links['other'])        # other links
        ]
        cases.append(case)
    return cases
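# Fetch the wikitext, external links, and interwiki links for one case
# section from the parse API.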
def fetch_case_data(url):
    print(url)
    data = requests.get(url).json()
    return data['parse']
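# A case counts as resolved if its wikitext contains a closing template such
# as {{resolved}}, {{atop}}, or {{archive top}}.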
def is_case_resolved(case):
    return re.search(REGEX_CASE_RESOLVED, case, re.IGNORECASE) is not None
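# Approximate the date a case was opened by taking the earliest signature
# timestamp found in its wikitext.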
def get_case_open_date(case):
    matches = re.findall(REGEX_USER_TALK_DATES, case, re.IGNORECASE)
    if not matches:
        return 'Not found'
    dates = []
    for date in matches:
        ts = calendar.timegm(time.strptime(date, '%H:%M, %d %B %Y'))
        dates.append(ts)
    dates.sort()
    # timegm() interprets the parsed struct_time as UTC, so convert back with
    # utcfromtimestamp() rather than fromtimestamp() to avoid a local-timezone shift.
    return datetime.datetime.utcfromtimestamp(dates[0]).isoformat()
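# Sort every link in the case wikitext into buckets: diffs, special pages,
# other wiki pages, interaction-timeline links, toollabs links, and the rest.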
def get_links(case):
    links = {
        'diff': [],
        'special': [],
        'other_wiki': [],
        'timeline': [],
        'toollabs': [],
        'other': []
    }
    # get urls from the case wikitext
    matches = re.findall(REGEX_LINKS, case)
    if matches:
        for link in matches:
            if re.search(r'(?:diff=|Special:Diff)', link, re.IGNORECASE):
                links['diff'].append(link)
            # negative lookahead: Special: pages other than Special:Diff
            elif re.search(r'Special:(?!Diff)', link, re.IGNORECASE):
                links['special'].append(link)
            # any remaining wikipedia.org URL (diffs and Special: pages were caught above)
            elif re.search(r'wikipedia\.org', link, re.IGNORECASE):
                links['other_wiki'].append(link)
            elif link.find('/interaction-timeline') != -1:
                links['timeline'].append(link)
            elif link.find('tools.wmflabs.org') != -1:
                links['toollabs'].append(link)
            else:
                links['other'].append(link)
    # get template links
    matches = re.findall(REGEX_SPECIAL_TEMPLATE, case, re.IGNORECASE)
    if matches:
        for special_page in matches:
            link = 'https://en.wikipedia.org/wiki/' + special_page
            if special_page.find('Diff') != -1:
                links['diff'].append(link)
            else:
                links['special'].append(link)
    matches = re.findall(REGEX_WP_LINK, case, re.IGNORECASE)
    if matches:
        for wiki_page in matches:
            link = 'https://en.wikipedia.org/wiki/' + wiki_page
            links['other_wiki'].append(link)
    return links
if __name__ == '__main__':
    main()