Skip to content

Instantly share code, notes, and snippets.

@rpryzant
Created July 16, 2019 12:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rpryzant/561cc1b4d372cce7479fd14290eacbc3 to your computer and use it in GitHub Desktop.
Save rpryzant/561cc1b4d372cce7479fd14290eacbc3 to your computer and use it in GitHub Desktop.
wiki text cleaner
def rm_refs(x):
REF_RE = '<ref([-\w=" <>]+)?>.*?<([ ]+)?\/([ ]+)?ref>'
x = re.sub(REF_RE, ' ', x)
# leading </ref>
if '</ref>' in x:
x = re.sub(REF_RE, ' ', '<ref>' + x)
# trailing <ref>
if '<ref' in x:
x = re.sub(REF_RE, ' ', x + '</ref>')
return x
def clean_wikitext(token_list):
x = ' '.join(token_list)
# ascii only
x = ''.join(filter(lambda x: x in string.printable, x))
# preemptively remove <ref>'s (including uncompleted)
x = x.strip()
x = rm_refs(x)
# collapse multispaces
x = re.sub('[ ]+', ' ', x)
parse = mwparserfromhell.parse(x)
plaintext = parse.strip_code()
plaintext = rm_refs(plaintext) # get refs again? some things missed
# collapse multispaces
plaintext = re.sub('[ ]+', ' ', plaintext)
# parse again to hit complicatd nested wikicode like 21055249
parse = mwparserfromhell.parse(plaintext)
plaintext = parse.strip_code()
# ignore lines starting with ! or | (likely table artifacts)
if plaintext.startswith('?') or plaintext.startswith('|'):
plaintext = ''
# ignore lines without text, e.g. ( , , , , ) or ]]
if not re.findall('\w', plaintext):
plaintext = ''
# parse AGAIN again to hit remaining links e.g. 377258469
plaintext = plaintext.replace('[ ', '[').replace(' ]', ']')
parse = mwparserfromhell.parse(plaintext)
plaintext = parse.strip_code()
# at this point just rm all brackets
plaintext = plaintext.replace(']', '').replace('[', '')
# rm html
plaintext = re.sub('http\S+', '', plaintext)
# rm parents with nothing in them, e.g. (; )
plaintext = re.sub('\([^\w]*\)', '', plaintext)
# rm remining <del>, <ins> (valid tags should already have been taken parsed)
plaintext = re.sub('<\/?(del|ins)([-\w=" <>]+)?>', '', plaintext)
# fuck stars
plaintext = plaintext.replace('*', '')
# rm table fragments
plaintext = re.sub('(right[ ]?\||left[ ]?\||thumb[ ]?\||frame[ ]?\||\d+px[ ]?\|)', '', plaintext)
# ignore timestamp sentences
if 'retrieved on' in plaintext.lower():
plaintext = ''
# msc html missed
plaintext = plaintext.replace('<blockquote>', '')
# remove tabs and newlines (those is our deliminators beeyotch)
plaintext.replace('\t', ' ')
plaintext.replace('\n', ' ')
plaintext.replace('\r', '')
# collapse multispaces (again again)
plaintext = re.sub('[ ]+', ' ', plaintext).strip()
return plaintext
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment