Skip to content

Instantly share code, notes, and snippets.

@gullyn
Created January 20, 2021 12:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gullyn/73a607e70970d0b151bac5e7c2c2a7e5 to your computer and use it in GitHub Desktop.
Save gullyn/73a607e70970d0b151bac5e7c2c2a7e5 to your computer and use it in GitHub Desktop.
analyze
import json, requests, re
def main():
    """Measure the Wikipedia page size of every listed country and log it.

    Reads ``countries.json`` from the working directory. Each entry is
    indexed as ``entry[0]`` (the label that is printed and logged) and
    ``entry[1]`` (a string containing ``wiki/``; everything after the first
    ``wiki/`` is used as the page title). For every entry the recursive
    page length is printed and appended to ``results.txt`` as a
    ``label,length`` line.
    """
    # Initial request kept from the original script; it prints the size for
    # one hard-coded page before the main loop starts.
    print(page_length("Ivory_Coast", True))

    # Context manager so the input handle is closed even if parsing fails.
    # JSON text is UTF-8 by specification, so do not rely on the locale.
    with open("countries.json", "r", encoding="utf-8") as source:
        countries = json.load(source)

    for country in countries:
        link = country[1].split("wiki/")[1]
        length = page_length(link, True)
        print(country[0], length)
        # Re-opened in append mode for each entry so rows already written
        # survive a failure on a later request; ``with`` guarantees the
        # handle is closed even when the write itself raises.
        with open("results.txt", "a") as results:
            results.write(f"{country[0]},{length}\n")
def page_length(title, recursive=False):
    """Return the length, in characters, of a Wikipedia page's rendered HTML.

    The page is fetched through the English Wikipedia ``action=parse`` API.

    Args:
        title: Page title as it appears after ``/wiki/`` in a URL. It may be
            percent-encoded (as in an ``href``); it is decoded once and then
            re-encoded safely as a query parameter.
        recursive: When true, the lengths of the pages returned by
            ``get_links`` for this page are added to the result. Those pages
            are fetched non-recursively, so the walk is one level deep.

    Returns:
        The number of characters of HTML, including linked pages when
        ``recursive`` is true.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.
        KeyError: When the API response carries no parsed text (for example
            the page does not exist).
    """
    from urllib.parse import unquote

    # Passing the title via ``params`` lets requests encode characters such
    # as "+", "&" and "#", which would otherwise corrupt the query string.
    # Titles taken from hrefs are already percent-encoded, so decode first
    # to avoid double encoding.
    req = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={"action": "parse", "page": unquote(title), "format": "json"},
        timeout=30,  # never hang the whole run on one stalled connection
    )
    req.raise_for_status()

    payload = json.loads(req.content)
    try:
        content = payload["parse"]["text"]["*"]
    except KeyError as err:
        # Same exception type as before, but say which page failed and why.
        raise KeyError(
            f"no parsed text for page {title!r}: {payload.get('error')}"
        ) from err

    sum_len = len(content)
    if not recursive:
        return sum_len

    # NOTE(review): get_links may return the same page more than once, in
    # which case it is counted more than once here - confirm this is intended.
    for link in get_links(content, title):
        sum_len += page_length(link)
    return sum_len
def get_links(content, title):
    """Collect "Main article(s)" link targets that relate to *title*.

    Scans *content* (HTML text) for ``Main article:`` / ``Main articles:``
    notes followed by ``/wiki/`` anchors. Each regex match yields at most
    two captured targets. A target is kept when it is non-empty and
    contains, case-insensitively, the part of *title* that precedes its
    first space (the whole title when it has no space). Any ``#fragment``
    suffix is removed from kept targets.

    Returns:
        A list of link targets in match order; duplicates are not removed.
    """
    pattern = r"Main articles?: (?:<a href=\"\/wiki\/(.+?)\".+?>.+?</a> ?a?n?d? ?)?(?:<a href=\"\/wiki\/(.+?)\".+?>.+?</a> ?a?n?d? ?){1,}"
    # The filter token depends only on the title, so compute it once.
    needle = title.split(" ")[0].lower()
    return [
        target.split("#")[0]
        for groups in re.findall(pattern, content)
        for target in groups
        if target and needle in target.lower()
    ]
# Run the measurement only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment