Skip to content

Instantly share code, notes, and snippets.

@jbothma
Created September 30, 2023 09:21
Show Gist options
  • Save jbothma/9ad66f8b8ffe4d5e1c3dd956985845d2 to your computer and use it in GitHub Desktop.
Save jbothma/9ad66f8b8ffe4d5e1c3dd956985845d2 to your computer and use it in GitHub Desktop.
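# Usage: python scrape_wikipedia.py countries.json intros flags
#
# countries.json is the countries search facet: a JSON list of objects, each
# with a "label" (used to build the Wikipedia page URL) and a "name" (used for
# the output file names). One <name>.md file is written into the intros
# directory and, when a flag is downloaded, one <name>.svg into the flags
# directory.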
import requests
from lxml import html
import json
from sys import argv
import re
from normality.cleaning import collapse_spaces
from yaml import dump
def clean_first_line(text):
    """Return the first two sentences of ``text`` with asides stripped.

    Whitespace is collapsed, parenthesised asides (together with the space
    before them) and short bracketed markers of 1-20 characters such as
    ``[1]`` are removed, and the leading two full-stop-terminated runs of
    the cleaned text are returned.

    Returns ``None`` when there is no text left to work with or when the
    cleaned text does not contain two full stops.
    """
    text = collapse_spaces(text)
    # NOTE(review): collapse_spaces is assumed to be able to hand back None
    # (e.g. for None input) -- confirm against normality. Guarding here keeps
    # re.sub from raising TypeError; an empty string never matched anyway.
    if not text:
        return None

    # Raw strings: "\(", "\[" and "\." are invalid escapes in plain literals.
    text = re.sub(r" \([^\)]+\)", "", text)
    text = re.sub(r"\[[^\]]{1,20}\]", "", text)

    match = re.match(r"([^\.]+\.[^\.]+\.)", text)
    if match:
        return match.group(1)
    return None
# https://upload.wikimedia.org/wikipedia/commons/thumb/3/36/Flag_of_Albania.svg/125px-Flag_of_Albania.svg.png
# https://upload.wikimedia.org/wikipedia/commons/3/36/Flag_of_Albania.svg
def clean_flag_url(url):
    """Turn a Wikimedia thumbnail URL into the URL of the original SVG.

    Example::

        //upload.wikimedia.org/wikipedia/commons/thumb/3/36/Flag_of_Albania.svg/125px-Flag_of_Albania.svg.png
        -> https://upload.wikimedia.org/wikipedia/commons/3/36/Flag_of_Albania.svg

    The input may be protocol-relative (``//upload...``) or carry an explicit
    ``http:``/``https:`` scheme; the result always uses ``https:``.

    Returns ``None`` when ``url`` is empty/None or is not an
    upload.wikimedia.org URL containing ``.svg``.
    """
    if not url:
        return None

    # Non-greedy ".+?" stops at the first ".svg", which drops the trailing
    # "/125px-....svg.png" thumbnail rendition from the path.
    match = re.match(r"(?:https?:)?(//upload\.wikimedia\.org/.+?\.svg)", url)
    if not match:
        return None

    clean_url = "https:" + match.group(1)
    # Thumbnails live under ".../thumb/..."; originals use the same path
    # without that segment.
    return clean_url.replace("thumb/", "")
# Seconds to wait on each HTTP request so a stalled connection cannot hang
# the whole run.
REQUEST_TIMEOUT = 30

# argv[1]: JSON file holding the list of countries to scrape.
with open(argv[1], encoding="utf-8") as jsonfile:
    countries = json.load(jsonfile)

for country in countries:
    # Build the article URL from the country label, spaces -> underscores.
    slug = country["label"].replace(" ", "_")
    url = f"https://en.wikipedia.org/wiki/{slug}"
    print(url)
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    doc = html.fromstring(response.text)

    # The intro is taken from the element that directly follows the first
    # infobox table on the page.
    intro = None
    tables = doc.xpath('.//table[contains(@class, "infobox")]')
    if tables:
        first_para = tables[0].getnext()
        # getnext() returns None when the infobox has no following sibling.
        if first_para is not None:
            intro = clean_first_line(first_para.text_content())

    # NOTE(review): the source's indentation was lost; this check is assumed
    # to sit at loop level so that pages without a usable intro get both an
    # empty intro and an empty URL -- confirm.
    if intro is None:
        intro = ''
        url = ''

    # Use the first image whose src mentions "Flag" as the country's flag.
    flag_images = doc.xpath('.//img[contains(@src, "Flag")]')
    flag_url = None
    if flag_images:
        flag_url = clean_flag_url(flag_images[0].get("src"))

    # flag_filename stays None unless the SVG was actually downloaded.
    flag_filename = None
    if flag_url:
        flag_r = requests.get(flag_url, timeout=REQUEST_TIMEOUT)
        if flag_r.status_code == 200:
            flag_filename = f"{country['name']}.svg"
            # argv[3]: directory receiving the flag files.
            flag_path = f"{argv[3]}/{flag_filename}"
            with open(flag_path, "wb") as flag_file:
                flag_file.write(flag_r.content)

    # argv[2]: directory receiving one markdown file per country, holding
    # only a YAML front-matter block.
    path = f"{argv[2]}/{country['name']}.md"
    preamble = {
        "wikipedia_url": url,
        "wikipedia_intro": intro,
        "summary": "",
        "flag": flag_filename,
    }
    with open(path, "w", encoding="utf-8") as outfile:
        outfile.write(f"---\n{dump(preamble)}\n---\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment