Created
July 16, 2019 12:22
-
-
Save rpryzant/561cc1b4d372cce7479fd14290eacbc3 to your computer and use it in GitHub Desktop.
wiki text cleaner
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def rm_refs(x):
    """Strip MediaWiki ``<ref ...>...</ref>`` citations from *x*.

    Each matched reference (tag pair plus its contents) is replaced with a
    single space. Two fragment cases are also handled, since the input may be
    a slice of a larger revision text:

    * a dangling leading ``</ref>`` (close tag whose opener was cut off) —
      an opener is prepended so the pair matches;
    * a dangling trailing ``<ref ...`` (opener whose close was cut off) —
      a closer is appended so the pair matches.

    Args:
        x: wikitext string.

    Returns:
        The string with reference spans replaced by spaces.
    """
    # Raw string: the original used a plain literal, so '\w' and '\/' were
    # invalid escape sequences (SyntaxWarning on modern Python). The regex
    # itself is unchanged.
    REF_RE = r'<ref([-\w=" <>]+)?>.*?<([ ]+)?\/([ ]+)?ref>'
    x = re.sub(REF_RE, ' ', x)
    # Leading </ref>: synthesize an opener so the pair matches.
    # NOTE(review): if the synthesized pair still fails to match (e.g. a
    # newline between the tags, since DOTALL is off), the prepended '<ref>'
    # remains in the output — preserved as-is to keep behavior identical.
    if '</ref>' in x:
        x = re.sub(REF_RE, ' ', '<ref>' + x)
    # Trailing <ref: synthesize a closer so the pair matches.
    if '<ref' in x:
        x = re.sub(REF_RE, ' ', x + '</ref>')
    return x
def clean_wikitext(token_list):
    """Convert a list of wikitext tokens into a single cleaned plaintext line.

    Joins the tokens, strips non-printable/non-ASCII characters, removes
    ``<ref>`` citations (including dangling fragments), then runs
    ``mwparserfromhell`` repeatedly to strip nested wiki markup, and finally
    scrubs leftover brackets, URLs, empty parens, stray tags, and table
    fragments. Lines judged to be table artifacts or content-free are
    returned as the empty string.

    Args:
        token_list: sequence of wikitext token strings.

    Returns:
        A single-line plaintext string ('' when the line is discarded).
    """
    x = ' '.join(token_list)
    # ASCII/printable only.
    x = ''.join(filter(lambda c: c in string.printable, x))
    # Preemptively remove <ref>'s (including uncompleted ones).
    x = x.strip()
    x = rm_refs(x)
    # Collapse runs of spaces.
    x = re.sub(r'[ ]+', ' ', x)
    parse = mwparserfromhell.parse(x)
    plaintext = parse.strip_code()
    # Strip refs again — strip_code can expose ones missed the first time.
    plaintext = rm_refs(plaintext)
    # Collapse runs of spaces.
    plaintext = re.sub(r'[ ]+', ' ', plaintext)
    # Parse again to hit complicated nested wikicode (e.g. revision 21055249).
    parse = mwparserfromhell.parse(plaintext)
    plaintext = parse.strip_code()
    # Drop likely table artifacts.
    # NOTE(review): the original comment said "starting with ! or |" but the
    # code tests '?' — code behavior preserved; confirm intent upstream.
    if plaintext.startswith('?') or plaintext.startswith('|'):
        plaintext = ''
    # Drop lines with no word characters, e.g. "( , , , , )" or "]]".
    if not re.findall(r'\w', plaintext):
        plaintext = ''
    # Parse AGAIN to hit remaining links (e.g. revision 377258469);
    # tighten "[ x ]" to "[x]" first so the link syntax is recognized.
    plaintext = plaintext.replace('[ ', '[').replace(' ]', ']')
    parse = mwparserfromhell.parse(plaintext)
    plaintext = parse.strip_code()
    # At this point just remove all remaining brackets.
    plaintext = plaintext.replace(']', '').replace('[', '')
    # Remove URLs.
    plaintext = re.sub(r'http\S+', '', plaintext)
    # Remove parens with nothing meaningful in them, e.g. "(; )".
    plaintext = re.sub(r'\([^\w]*\)', '', plaintext)
    # Remove remaining <del>/<ins> tags (valid ones were already parsed out).
    plaintext = re.sub(r'<\/?(del|ins)([-\w=" <>]+)?>', '', plaintext)
    # Remove stray asterisks (list markers).
    plaintext = plaintext.replace('*', '')
    # Remove image/table fragments.
    plaintext = re.sub(
        r'(right[ ]?\||left[ ]?\||thumb[ ]?\||frame[ ]?\||\d+px[ ]?\|)',
        '', plaintext)
    # Drop timestamp/citation sentences.
    if 'retrieved on' in plaintext.lower():
        plaintext = ''
    # Miscellaneous HTML missed by the parser.
    plaintext = plaintext.replace('<blockquote>', '')
    # Remove tabs and newlines (they are the record delimiters downstream).
    # BUG FIX: str.replace returns a new string; the original discarded the
    # result of these three calls, so tabs/newlines/CRs were never removed.
    plaintext = plaintext.replace('\t', ' ')
    plaintext = plaintext.replace('\n', ' ')
    plaintext = plaintext.replace('\r', '')
    # Collapse runs of spaces one final time.
    plaintext = re.sub(r'[ ]+', ' ', plaintext).strip()
    return plaintext
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment