Created
May 15, 2019 20:24
-
-
Save notconfusing/3e743435a4a6da98e9c68d6c58968a0d to your computer and use it in GitHub Desktop.
Wikipedia User Name Validator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import mwapi\n", | |
"\n", | |
"def validate_username(user_name, lang):\n", | |
" \"\"\"\n", | |
" Normalize a single user name via the wikimedia API\n", | |
" :param user_name: the user-name to normalize\n", | |
" :param lang: the language to make an api session if none supplied\n", | |
" :return: dict like {'user_id': 123, 'name': 'canonical username'}\n", | |
" \"\"\"\n", | |
" mwapi_session = mwapi.Session(f'https://{lang}.wikipedia.org', user_agent=\"CivilServant Username Normalizer <operations@civilservant.io>\")\n", | |
" user_name_resp = mwapi_session.get(action='query', list='users', ususers=user_name)\n", | |
" try:\n", | |
" name_id = user_name_resp['query']['users'][0]\n", | |
" return {\"valid\":True, \"user_id\": name_id['userid'], \"canonical_user_name\": name_id['name']}\n", | |
" except Exception as e:\n", | |
" e = \"User name not found\" if isinstance(e, KeyError) else e\n", | |
" return {\"valid\":False, \"error\":e}" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"You can provide a string, and the canonical name might be different: Wikimedia capitalizes the first letter." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'valid': True, 'user_id': 13023703, 'canonical_user_name': 'Maximilianklein'}" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"validate_username('maximilianklein', 'en')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Of course, exact matches work." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'valid': True, 'user_id': 13023703, 'canonical_user_name': 'Maximilianklein'}" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"validate_username('Maximilianklein', 'en')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"So too does the \"User:\" prefix convention." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'valid': True, 'user_id': 13023703, 'canonical_user_name': 'Maximilianklein'}" | |
] | |
}, | |
"execution_count": 17, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"validate_username('User:Maximilianklein', 'en')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Note that the language is important for user_ids: the same user name can have a different user_id on each language's wiki." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'valid': True, 'user_id': 33564818, 'canonical_user_name': 'Juliakamin'}" | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"validate_username('User:Juliakamin', 'en')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"In French it's different." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'valid': True, 'user_id': 3113092, 'canonical_user_name': 'Juliakamin'}" | |
] | |
}, | |
"execution_count": 28, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"validate_username('User:Juliakamin', 'fr')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Spaces or underscores" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'valid': True, 'user_id': 7723863, 'canonical_user_name': 'Piano non troppo'}" | |
] | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"validate_username('Piano non troppo', 'en')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Should return the same" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'valid': True, 'user_id': 7723863, 'canonical_user_name': 'Piano non troppo'}" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"validate_username('Piano_non_troppo', 'en')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "slide" | |
} | |
}, | |
"source": [ | |
"Invalid user names should return with \"valid:False\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'valid': False, 'error': 'User name not found'}" | |
] | |
}, | |
"execution_count": 29, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"validate_username('doesnt exist user name', 'en')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment