Created
July 2, 2019 06:27
-
-
Save matteoferla/4a644fab0735dd41e2c1fb7827bf3e2f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import wikitextparser as wtp | |
import re | |
####### code to convert template to dictionary | |
def arg_to_val(arg):
    """Return the cleaned, plain-text value of a template argument.

    Processing steps, in order:

    1. Every nested template that has at least one argument is replaced by
       the value of its first argument. For ``nowrap``/``val`` templates the
       value of the first argument whose name contains ``ul`` (the unit) is
       appended. Templates without arguments are left untouched.
    2. HTML comments, self-closing tags and paired tags (including their
       content) are removed.
    3. En and em dashes are converted to a hyphen-minus.
    4. ``± <number>`` error terms are removed.
    5. Leading and trailing whitespace is stripped.

    :param arg: object exposing ``value`` (str) and ``templates`` (iterable
        of objects with ``arguments``, ``normal_name()`` and ``string``);
        presumably a ``wikitextparser`` argument -- confirm against caller.
    :return: the cleaned string.
    """
    val = arg.value
    for t in arg.templates:
        if not t.arguments:
            # Nothing to substitute; indexing [0] would raise IndexError.
            continue
        tval = t.arguments[0].value
        if t.normal_name() in ('nowrap', 'val'):
            # Unit argument, if any (first argument whose name contains 'ul').
            unit = next((a.value for a in t.arguments if 'ul' in a.name), None)
            if unit is not None:
                tval += unit
        val = val.replace(t.string, tval)
    # Comments go first so a leading '<!--' is never mistaken for the start
    # of a tag by the patterns below.
    val = re.sub(r'<!--.*?-->', '', val, flags=re.DOTALL)
    # '[^<>]*' keeps each match inside a single tag instead of letting it run
    # from an earlier '<' across real content.
    val = re.sub(r'<[^<>]*/>', '', val)  # remove self-closing tags
    # The backreference pairs an opening tag with its own closing tag; '\b'
    # stops the tag name from being shortened by backtracking.
    val = re.sub(r'<(\w+)\b[^<>]*>.*?</\1\s*>', '', val, flags=re.DOTALL)
    val = val.replace('\u2013', '-')  # en dash to hyphen-minus
    val = val.replace('\u2014', '-')  # em dash to hyphen-minus
    # Clear the error term for safety; whitespace after the sign is optional.
    val = re.sub(r'±\s*\d+\.?\d*', '', val)
    return val.strip()
def arg_to_key(arg):
    """Return the argument's name with surrounding whitespace removed.

    :param arg: object exposing a string ``name`` attribute.
    :return: the trimmed name, used as a dictionary key.
    """
    raw_name = arg.name
    return raw_name.strip()
def template_to_dict(template):
    """Convert a template into a ``{name: value}`` dictionary.

    Each entry of ``template.arguments`` contributes one item: the key comes
    from ``arg_to_key`` and the value from ``arg_to_val``. If two arguments
    yield the same key, the later one wins.

    :param template: object exposing an iterable ``arguments`` attribute.
    :return: dict mapping cleaned argument names to cleaned values.
    """
    converted = {}
    for argument in template.arguments:
        key = arg_to_key(argument)
        converted[key] = arg_to_val(argument)
    return converted
########### example | |
# Print every 'Starbox astrometry' template found in ``text`` as a dictionary.
# NOTE(review): ``text`` is not defined in this snippet -- it presumably holds
# the wikitext to parse and must be assigned before this loop runs.
for template in wtp.parse(text).templates:
    # Compare normal_name() rather than name: name has trailing space.
    if template.normal_name() != 'Starbox astrometry':
        continue
    print(template_to_dict(template))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment