Skip to content

Instantly share code, notes, and snippets.

@turicas
Created November 10, 2013 00:50
Show Gist options
  • Save turicas/7392176 to your computer and use it in GitHub Desktop.
Save turicas/7392176 to your computer and use it in GitHub Desktop.
L² Hackathon WikiMedia
#!/usr/bin/env python
# coding: utf-8
import json
import requests
# URL template for the pt.wikipedia.org API "query" action, asking for page
# revisions with their content as JSON. `{}` is the placeholder for the page
# title.
QUERY_URL = (
    u'http://pt.wikipedia.org/w/api.php'
    u'?format=json'
    u'&action=query'
    u'&titles={}'
    u'&prop=revisions'
    u'&rvprop=content'
)

# URL template for the pt.wikipedia.org API "parse" action, answering in
# JSON. `{}` is the placeholder for the wikitext to be parsed.
PARSE_URL = (
    u'http://pt.wikipedia.org/w/api.php'
    u'?action=parse'
    u'&format=json'
    u'&text={}'
)

# Section titles that main() processes; every other section is skipped.
SECOES = [
    u'Geral',
    u'Combate ao vandalismo',
    u'Estatutos',
    u'Construção de artigos',
    u'Comunidade',
]
def get_wikitext(page_title):
    """Return the wikitext of the page called *page_title*.

    The title is percent-encoded, substituted into ``QUERY_URL`` and fetched
    with an HTTP GET. The JSON reply is expected to carry the requested page
    under ``query.pages``; the ``'*'`` field of that page's first listed
    revision is returned.

    Raises ``requests.HTTPError`` when the server answers with an error
    status, and ``KeyError``/``IndexError`` when the reply does not have the
    expected shape (for example, the page has no revisions).
    """
    # Function-scope import so the block works on both Python 2 and 3
    # without touching the file-level imports.
    try:
        from urllib.parse import quote  # Python 3
    except ImportError:
        from urllib import quote  # Python 2

    # Encode to UTF-8 bytes first (Python 2's quote() fails on non-ASCII
    # unicode) and escape everything (safe=''): otherwise characters such as
    # '&', '#' or '+' in a title would be read as URL syntax, not as data.
    encoded_title = quote(page_title.encode('utf-8'), safe='')
    url = QUERY_URL.format(encoded_title)

    result = requests.get(url)
    # Fail with a clear HTTP error instead of an obscure JSON decoding one.
    result.raise_for_status()

    pages = result.json()['query']['pages']
    # A single title was requested, so take the first (only) entry.
    # list(...)[0] works on Python 2 and 3, unlike keys()[0].
    page = list(pages.values())[0]
    return page['revisions'][0]['*']
def parse_wikitext(wikitext):
    """Send *wikitext* to the wiki's ``action=parse`` API and return the result.

    The text is percent-encoded, substituted into ``PARSE_URL`` and fetched
    with an HTTP GET. The value returned is the ``'*'`` field found under
    ``parse.text`` in the JSON reply (presumably the rendered HTML -- the
    caller stores it as ``conteudo_html``).

    Raises ``requests.HTTPError`` when the server answers with an error
    status, and ``KeyError`` when the reply has no ``parse.text`` entry.
    """
    # Function-scope import so the block works on both Python 2 and 3
    # without touching the file-level imports.
    try:
        from urllib.parse import quote  # Python 3
    except ImportError:
        from urllib import quote  # Python 2

    # Wikitext routinely contains '&', '#', '=', '+' and non-ASCII letters.
    # Unescaped, those are interpreted as URL syntax and the server receives
    # truncated or altered text, so escape every character (safe='').
    # NOTE(review): very long sections may still exceed URL length limits;
    # consider sending the text in a POST body if that shows up in practice.
    encoded_text = quote(wikitext.encode('utf-8'), safe='')
    url = PARSE_URL.format(encoded_text)

    result = requests.get(url)
    # Fail with a clear HTTP error instead of an obscure JSON decoding one.
    result.raise_for_status()

    return result.json()['parse']['text']['*']
def main():
    """Print the selected sections of the timeline page as wikitext and HTML.

    Downloads the wikitext of 'Wikipédia:Arqueologia/Linha do tempo', cuts it
    into sections at each '== ' marker and, for every section whose title is
    listed in ``SECOES``, prints the title, the section wikitext (with a
    ``{{Referências}}`` template appended) and the result of running that
    wikitext through ``parse_wikitext``.
    """
    page_title = u'Wikipédia:Arqueologia/Linha do tempo'
    page_wikitext = get_wikitext(page_title)

    # After the split every chunk begins with the text of a heading; chunk 0
    # is whatever precedes the first heading and is ignored.
    partes = page_wikitext.split('== ')
    for parte in partes[1:]:
        linhas = parte.split('\n')
        # First line is the heading remainder, e.g. 'Geral ==' -> 'Geral'.
        titulo = linhas[0].replace('==', '').strip()
        if titulo not in SECOES:
            continue

        conteudo = u'\n'.join(linhas[1:]) + u'\n\n{{Referências}}'
        # TODO: references are not working
        # TODO: .replace with the links
        conteudo_html = parse_wikitext(conteudo)

        # Single-argument print(...) behaves the same as a Python 2 print
        # statement and as the Python 3 function; print('') stands in for a
        # bare `print` (a bare print() would emit '()' on Python 2).
        print('-' * 80)
        print(titulo)
        print('-' * 80)
        print(conteudo)
        print('=' * 80)
        print(conteudo_html)
        print(':' * 80)
        print('')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment