# dayllanmaza/ani-parser.py

## ani-parser.py
import os
import csv
import pprint
import re
import sys
import time
import calendar
import datetime
import requests

# Opening of a closure template: ``{{resolved``, ``{{atop`` or
# ``{{archive top`` (an optional whitespace character may follow the braces).
REGEX_CASE_RESOLVED = r'{{\s?(resolved|atop|archive\stop)'
# A ``[[User talk:...]]`` (or ``User_talk``) link followed by a signature
# timestamp; the timestamp (``HH:MM, D Month YYYY``) is the captured group.
# The day accepts one or two digits so that single-digit days such as
# ``7 June 2018`` are matched as well as ``17 June 2018``.
REGEX_USER_TALK_DATES = r'\[{2}User[\s_]talk\:(?:[^(?:\]\]|\||#)])*[^\]\]]*\]\]\)?\s(\d{2}:\d{2},\s\d{1,2}\s\w+\s\d{4})'
# An http/https URL, captured up to the next ``]``, ``|`` or whitespace.
REGEX_LINKS = r'((?:https|http):\/{2}(?:www\.)?[^]|\s]+)'
# Target of a ``[[Special:...`` wiki link, captured up to ``|`` or ``]``.
REGEX_SPECIAL_TEMPLATE = r'\[{2}(Special:[^\||\]]+)'
# Target of a ``[[WP:...`` or ``[[Wikipedia:...`` wiki link, captured up to
# ``|`` or ``]``.
REGEX_WP_LINK = r'\[{2}((?:WP|Wikipedia):\s?[^\|\]\]]*)'


# Title prefix of the archive pages; the archive number is appended to it.
# Alternative prefix kept for reference:
# noticeboard_page = 'Wikipedia:Administrators\'_noticeboard/Archive'
noticeboard_page = 'Wikipedia:Administrators\'_noticeboard/IncidentArchive'
# API endpoint for action=parse with JSON output; the page title and further
# query parameters are appended by the fetch functions.
base_url = 'https://en.wikipedia.org/w/api.php?format=json&action=parse&page='

def main():
    """Scrape noticeboard archive pages 981-991 into ``ani_links.csv``.

    The CSV file is created (or overwritten) in the directory that contains
    this script. A header row is written first, followed by one row per case
    returned by ``fetch_archive_cases`` for each archive number.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    # For the other noticeboard prefix the output name used was 'an_links.csv'.
    out_path = os.path.join(script_dir, 'ani_links.csv')

    # newline='' is what the csv module expects for files it writes: several
    # cells hold '\n'-joined link lists, and newline translation would
    # otherwise corrupt them or insert blank rows on some platforms.
    # An explicit encoding keeps non-ASCII titles from failing on platforms
    # whose default encoding cannot represent them.
    with open(out_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        headers = ['title', 'url', 'resolved', 'open date', 'diff total', 'diff links',
        'special total', 'special links', 'other wiki total', 'other wiki', 'timeline total',
        'timeline links', 'toollabs total', 'toollabs links', 'others total', 'others links']
        writer.writerow(headers)

        # Archive numbers to scrape; range(293, 303) was the range used with
        # the alternative noticeboard prefix.
        for i in range(981, 992):
            cases = fetch_archive_cases(i)
            if cases:
                writer.writerows(cases)


def fetch_archive_cases(index):
    """Fetch one archive page and build a CSV row for each top-level section.

    Args:
        index: Archive number appended to ``noticeboard_page``.

    Returns:
        A list of rows, one per top-level section, each with 16 values:
        title, URL, resolved flag, open date, then a count and a
        newline-joined list for each link category. The list is empty when
        the API response carries no ``parse`` payload.
    """
    page_url = base_url + noticeboard_page + str(index) + '&prop=sections'

    data = requests.get(page_url).json()
    print(page_url)
    parsed = data.get('parse')
    if parsed is None:
        # Without a 'parse' member there is nothing to iterate over (for
        # instance when the requested archive page does not exist).
        print('Could not fetch: ' + page_url)
        return []
    sections = parsed.get('sections', [])

    cases = []
    for section in sections:

        if section['toclevel'] != 1: # we only care about top level sections
            continue

        if not section['index']:
            print('Could not parse: ' + section['anchor'])
            continue

        section_url = base_url + noticeboard_page + str(index) + '&prop=wikitext|externallinks|iwlinks' + '&section=' + section['index']
        section_data = fetch_case_data(section_url)

        # With format=json and the default format version the API wraps the
        # text as {'*': text}. Calling str() on that dict would hand the
        # regexes a Python repr with escaped newlines and quotes, so unwrap
        # it when present.
        raw_wikitext = section_data['wikitext']
        if isinstance(raw_wikitext, dict):
            wikitext = raw_wikitext.get('*', '')
        else:
            wikitext = str(raw_wikitext)

        case_url = 'https://en.wikipedia.org/wiki/' + section['fromtitle'] + '#' + section['anchor']
        print(case_url)
        links = get_links(wikitext)
        case = [
            section['line'], # title
            case_url, # url
            is_case_resolved(wikitext), # resolved
            get_case_open_date(wikitext), # open date
            len(links['diff']), # diff total
            '\n'.join(links['diff']), # diff links
            len(links['special']), # special pages count
            '\n'.join(links['special']), # special pages links
            len(links['other_wiki']), # other wiki links count
            '\n'.join(links['other_wiki']), # other wiki links
            len(links['timeline']), # timeline count
            '\n'.join(links['timeline']), # timeline links
            len(links['toollabs']), # toollabs count
            '\n'.join(links['toollabs']), # toollabs links
            len(links['other']), # other links count
            '\n'.join(links['other']) # other links
        ]

        cases.append(case)

    return cases


def fetch_case_data(url):
    """Request *url* and return the ``parse`` member of its JSON response.

    The URL is printed before the request is made.
    """
    print(url)
    response = requests.get(url)
    payload = response.json()
    return payload['parse']


def is_case_resolved(case):
    """Return True when *case* contains a closure template.

    The text is searched case-insensitively with ``REGEX_CASE_RESOLVED``.
    """
    closure = re.search(REGEX_CASE_RESOLVED, case, flags=re.IGNORECASE)
    # A match object is always truthy, so bool() equals an "is not None" test.
    return bool(closure)


def get_case_open_date(case):
    """Return the earliest signature timestamp found in *case*.

    Timestamps are collected with ``REGEX_USER_TALK_DATES`` and parsed with
    the format ``'%H:%M, %d %B %Y'``.

    Returns:
        The earliest timestamp as a naive ISO 8601 string (for example
        ``'2018-06-07T03:21:00'``), with the wall-clock value exactly as it
        appears in the text, or ``'Not found'`` when no timestamp matched or
        none could be parsed.
    """
    matches = re.findall(REGEX_USER_TALK_DATES, case, re.IGNORECASE)

    stamps = []
    for raw in matches:
        try:
            stamps.append(datetime.datetime.strptime(raw, '%H:%M, %d %B %Y'))
        except ValueError:
            # The pattern accepts any word where the month goes, so a match
            # is not guaranteed to be a real date; skip it rather than abort
            # the whole run.
            continue

    if not stamps:
        return 'Not found'

    # Comparing the parsed datetimes directly avoids a round trip through an
    # epoch value and datetime.fromtimestamp(), which converts to the
    # machine's local time zone and would shift the reported time.
    return min(stamps).isoformat()


def get_links(case):
    """Collect and categorise the links found in a case's wikitext.

    Three sources are scanned: plain URLs (``REGEX_LINKS``), ``[[Special:``
    wiki links (``REGEX_SPECIAL_TEMPLATE``) and ``[[WP:`` / ``[[Wikipedia:``
    wiki links (``REGEX_WP_LINK``). Wiki links are turned into full
    ``https://en.wikipedia.org/wiki/`` URLs.

    Args:
        case: Wikitext of a single case.

    Returns:
        A dict with the keys ``diff``, ``special``, ``other_wiki``,
        ``timeline``, ``toollabs`` and ``other``, each mapping to a list of
        URL strings in the order they were found.
    """
    links = {
        'diff': [],
        'special': [],
        'other_wiki': [],
        'timeline': [],
        'toollabs': [],
        'other': []
    }

    # One diff test shared by URLs and wiki links, so both are classified
    # the same way regardless of letter case. Special:MobileDiff is included
    # so wiki links to it keep counting as diffs.
    diff_pattern = re.compile(r'(?:diff=|Special:(?:Mobile)?Diff)', re.IGNORECASE)
    special_pattern = re.compile(r'Special:', re.IGNORECASE)
    wikipedia_pattern = re.compile(r'wikipedia\.org', re.IGNORECASE)

    # get urls from the case wikitext
    for link in re.findall(REGEX_LINKS, case):
        if diff_pattern.search(link):
            links['diff'].append(link)
        elif special_pattern.search(link):
            # Diffs were handled above, so any remaining Special: page
            # belongs here. A character class such as [^Diff] would wrongly
            # exclude every page whose name starts with D, i or f.
            links['special'].append(link)
        elif '/interaction-timeline' in link:
            links['timeline'].append(link)
        elif 'tools.wmflabs.org' in link:
            # Checked before the wikipedia.org test because tool URLs can
            # carry a wikipedia.org host name in their query string.
            links['toollabs'].append(link)
        elif wikipedia_pattern.search(link):
            links['other_wiki'].append(link)
        else:
            links['other'].append(link)

    # get template links
    for special_page in re.findall(REGEX_SPECIAL_TEMPLATE, case, re.IGNORECASE):
        link = 'https://en.wikipedia.org/wiki/' + special_page
        if diff_pattern.search(special_page):
            links['diff'].append(link)
        else:
            links['special'].append(link)

    # project-namespace wiki links
    for wiki_page in re.findall(REGEX_WP_LINK, case, re.IGNORECASE):
        links['other_wiki'].append('https://en.wikipedia.org/wiki/' + wiki_page)

    return links


# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()