Skip to content

Instantly share code, notes, and snippets.

@Facenapalm
Created March 13, 2023 21:47
Show Gist options
  • Save Facenapalm/4559e733848e63d459afb5a2e574dfe5 to your computer and use it in GitHub Desktop.
Save Facenapalm/4559e733848e63d459afb5a2e574dfe5 to your computer and use it in GitHub Desktop.
Russian Wikipedia's Non-free content criteria (Критерии добросовестного использования, КДИ) violations finder
import re
import json
import requests
def normalize(name: str) -> str:
    """Return *name* in a canonical page-title form.

    Underscores are treated as spaces, leading and trailing whitespace is
    removed, every run of whitespace collapses to a single space, and the
    first character is upper-cased (the rest of the string is left as is).

    An empty or whitespace/underscore-only *name* yields an empty string
    rather than raising ``IndexError``.
    """
    name = re.sub(r"\s+", " ", name.replace('_', ' ').strip())
    # Slicing (instead of indexing) keeps the empty string safe.
    return name[:1].upper() + name[1:]
_API_URL = 'https://ru.wikipedia.org/w/api.php'

# Matches a fair-use rationale template call on a file description page and
# captures the value of its "статья" (article) parameter.
_RATIONALE_ARTICLE_RE = re.compile(
    r'\{\{(?:[Оо]боснование добросовестного использования|[Нн]есвободный файл/ОДИ|ОДИ)[^\}]*\|\s*статья\s*=\s*([^\}]+?)\s*[\|\}]'
)


def _fetch_database():
    """Query the API for every file in the non-free files category.

    Returns a dict mapping each file title to
    ``{'usage': [...], 'licenses': [...]}``: the sorted titles of pages that
    use the file, and the sorted, normalized article names found in the
    rationale templates of the file description page.

    Raises:
        RuntimeError: if the API response contains an ``error`` member.
    """
    arguments = {
        'action': 'query',
        'prop': 'fileusage|revisions',
        'rvprop': 'content',
        'rvslots': 'main',
        'generator': 'categorymembers',
        'gcmtitle': 'Категория:Файлы:Несвободные',
        'gcmtype': 'file',
        'format': 'json',
        'formatversion': '2',
    }
    last_continue = {}
    database = {}
    while True:
        # Each request repeats the base query plus the continuation
        # parameters returned by the previous response.
        request = arguments.copy()
        request.update(last_continue)
        response = requests.get(_API_URL, params=request).json()
        if 'error' in response:
            raise RuntimeError(response['error'])
        if 'warnings' in response:
            print(response['warnings'])
        if 'query' in response:
            for pageinfo in response['query']['pages']:
                entry = database.setdefault(
                    pageinfo['title'],
                    {'usage': [], 'licenses': []},
                )
                if 'fileusage' in pageinfo:
                    # The same file can show up in several responses, so the
                    # usage list is extended rather than replaced.
                    entry['usage'] = sorted(
                        entry['usage']
                        + [data['title'] for data in pageinfo['fileusage']]
                    )
                if 'revisions' in pageinfo:
                    text = pageinfo['revisions'][0]['slots']['main']['content']
                    # Captures that are blank once underscores and whitespace
                    # are stripped name no article; skip them instead of
                    # trying to normalize an empty title.
                    entry['licenses'] = sorted(
                        normalize(name)
                        for name in _RATIONALE_ARTICLE_RE.findall(text)
                        if name.replace('_', ' ').strip()
                    )
        if 'continue' not in response:
            break
        last_continue = response['continue']
        print(f'{len(database)} files processed')
    return database


def _highlight_missing(pages, known):
    """Render *pages* as comma-separated wikilinks.

    Links to pages that are not in *known* are wrapped in a span with a
    coloured background.
    """
    return ", ".join(
        f'[[{page}]]' if page in known
        else f'<span style="background-color:#FFB080">[[{page}]]</span>'
        for page in pages
    )


def _write_report(database, path):
    """Write a wikitable of mismatching files from *database* to *path*."""
    with open(path, 'w', encoding='utf-8') as output:
        output.write('{| class="wikitable"\n! Файл !! Лицензии !! Использование')
        for filename, fileinfo in database.items():
            usage = fileinfo['usage']
            licenses = fileinfo['licenses']
            if usage == licenses:
                continue
            # <temp> Narrow the report to files that have at least one
            # rationale and are used on more pages than they have rationales.
            if not usage or not licenses:
                continue
            if len(usage) <= len(licenses):
                continue
            # </temp>
            licenses_list = _highlight_missing(licenses, set(usage))
            usage_list = _highlight_missing(usage, set(licenses))
            output.write(f'\n|-\n| [[:{filename}]] || {licenses_list} || {usage_list}')
        output.write('\n|}')


def main():
    """Build the usage/rationale database and write the mismatch report.

    Downloads usage and rationale data for every file in the non-free files
    category of Russian Wikipedia, saves the raw data to ``dump.json`` and
    writes a wikitable of files whose usage does not match their rationales
    to ``result.wiki``.

    Raises:
        RuntimeError: if the API reports an error.
    """
    database = _fetch_database()
    with open('dump.json', 'w', encoding='utf-8') as dump:
        json.dump(database, dump, indent=4, ensure_ascii=False)
    # NOTE: to rebuild the report without querying the API again, load
    # ``database`` from dump.json (json.load) instead of fetching it.
    _write_report(database, 'result.wiki')
# Run the finder only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment