# gullyn/analyze.py

## analyze.py
import json, requests, re

def main():
	"""Measure every country listed in countries.json and log the results.

	First prints the recursive page length of "Ivory_Coast". Then loads
	``countries.json`` and, for each entry, takes the text after ``wiki/`` in
	``entry[1]`` as the page title, computes its recursive page length, prints
	``entry[0]`` with that length, and appends ``entry[0],length`` as a line
	to ``results.txt``.

	Each entry is presumably a ``[name, wikipedia_url]`` pair -- confirm
	against the data file.
	"""
	print(page_length("Ivory_Coast", True))

	# Context manager so the input handle is closed even if parsing fails.
	with open("countries.json", "r") as source:
		countries = json.load(source)

	for country in countries:
		link = country[1].split("wiki/")[1]
		length = page_length(link, True)
		print(country[0], length)
		# Opened and closed once per row, so every finished row is written
		# out before the next page is fetched.
		with open("results.txt", "a") as results:
			results.write(f"{country[0]},{length}\n")

def page_length(title, recursive=False, timeout=30):
	"""Return the character count of a Wikipedia page's parsed text.

	The page is fetched through the English Wikipedia ``action=parse`` API and
	its length is ``len()`` of the string found under ``parse -> text -> *``
	in the JSON response.

	Args:
		title: Page title. It is interpolated into the query string verbatim,
			so it must already be URL-safe.
		recursive: When true, also add the (non-recursive) length of every
			page returned by ``get_links`` for this page. Only one level of
			linked pages is followed.
		timeout: Seconds to wait for the HTTP request before giving up.

	Returns:
		The summed length as an int.

	Raises:
		requests.RequestException: On a timeout, connection failure, or HTTP
			error status.
		KeyError: If the response has no ``parse`` section.
	"""
	# Without a timeout a single stalled connection would block forever.
	req = requests.get(
		f"https://en.wikipedia.org/w/api.php?action=parse&page={title}&format=json",
		timeout=timeout,
	)
	# Surface HTTP failures directly instead of as a confusing decode error.
	req.raise_for_status()
	content = json.loads(req.content)["parse"]["text"]["*"]
	sum_len = len(content)

	if not recursive:
		return sum_len

	for link in get_links(content, title):
		# Linked pages are measured on their own; their links are not followed.
		sum_len += page_length(link, timeout=timeout)

	return sum_len

def get_links(content, title):
	"""Collect "Main article(s)" link targets that mention the page title.

	Scans ``content`` for ``Main article: <a href="/wiki/...">`` runs. From
	each run the regex yields two captured targets (the second being the last
	anchor of the run). A target is kept when it is non-empty and contains,
	case-insensitively, the part of ``title`` before its first space. Any
	``#fragment`` is removed from the kept targets.

	Returns the kept targets as a list, in match order.
	"""
	pattern = r"Main articles?: (?:<a href=\"\/wiki\/(.+?)\".+?>.+?</a> ?a?n?d? ?)?(?:<a href=\"\/wiki\/(.+?)\".+?>.+?</a> ?a?n?d? ?){1,}"
	return [
		target.split("#")[0]
		for captured in re.findall(pattern, content)
		for target in captured
		# Groups that did not take part in a match come back as "".
		if target and title.split(" ")[0].lower() in target.lower()
	]

# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
	main()