Skip to content

Instantly share code, notes, and snippets.

@jacKlinc
Created February 9, 2024 14:22
Show Gist options
  • Save jacKlinc/c3e1464e434f4dc5871c547120039d4b to your computer and use it in GitHub Desktop.
Save jacKlinc/c3e1464e434f4dc5871c547120039d4b to your computer and use it in GitHub Desktop.
Download all Bellingcat articles and collect their extracted text in a table (the CSV export step is currently commented out)
import requests
from functools import reduce
from dataclasses import dataclass
from bs4 import BeautifulSoup
from newspaper import Article
import pandas as pd
# Root URL of the Bellingcat site; archive page URLs are built from it.
BASE_URL = "https://www.bellingcat.com"
# First year of the crawl range.
BELLINGCAT_START_YEAR = 2014 # earliest article on site
@dataclass
class ArticleItem:
    """One article link found on a monthly archive page."""

    # Year of the archive page the link was found on.
    year: int
    # Month (1-12) of the archive page the link was found on.
    month: int
    # The link's href value, exactly as it appears in the page markup.
    url: str
def list_months_article(year: int, month: int) -> list[ArticleItem]:
    """Return the article links listed on one monthly archive page.

    Args:
        year: Four-digit year of the archive page.
        month: Month number, 1-12.

    Returns:
        One ``ArticleItem`` per article tile that carries a link. The list
        is empty when the page contains no matching tiles.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    # Zero-pad the month to two digits. A literal "0" prefix produced
    # "010".."012" for October-December, so those archives were never hit.
    res = requests.get(f"{BASE_URL}/news/{year}/{month:02d}/", timeout=30)
    # The HTTP status is deliberately not checked: whatever comes back is
    # parsed, and a page without article tiles simply yields an empty list.
    articles = BeautifulSoup(res.content, "html.parser")

    # searches HTML for articles list
    news_item_tags = articles.find_all("div", {"class": "news_item__image"})

    items = []
    for tag in news_item_tags:
        link = tag.find("a")
        # Skip tiles without a usable link instead of crashing the crawl.
        if link is None or not link.get("href"):
            continue
        items.append(ArticleItem(year, month, link["href"]))
    return items
def flatten_list(x: list, y: list) -> list:
    """Concatenate two lists into a new list; neither input is modified."""
    joined = x + y
    return joined
# TODO refactor
def list_years_articles(year: int) -> list[ArticleItem]:
    """Collect the article links for all twelve months of ``year``.

    Months are fetched in calendar order and their results are joined into
    a single flat list.
    """
    collected = []
    for month_no in range(1, 13):
        collected = flatten_list(collected, list_months_article(year, month_no))
    return collected
def list_all_articles(end_year: int = 2024) -> list[ArticleItem]:
    """Collect article links for every year in the crawl range.

    Args:
        end_year: Exclusive upper bound of the year range. Defaults to 2024,
            the value that was previously hard-coded, so existing callers
            keep the same behaviour.

    Returns:
        A flat list of ``ArticleItem`` covering ``BELLINGCAT_START_YEAR`` up
        to, but not including, ``end_year``. The list is empty when the
        range is empty.
    """
    all_items: list[ArticleItem] = []
    # Extending one list avoids re-copying the growing result per year, and
    # unlike reduce() without an initializer it tolerates an empty range.
    for year in range(BELLINGCAT_START_YEAR, end_year):
        all_items.extend(list_years_articles(year))
    return all_items
def get_article_text(url: str) -> str:
    """Download the article at ``url``, parse it, and return its ``text``."""
    page = Article(url)
    # download() must run before parse(); text is read after parsing.
    page.download()
    page.parse()
    body_text = page.text
    return body_text
def main():
    """List every article, print a table summary, and fetch each article's text.

    Builds a DataFrame with one row per article link, adds a ``path`` column
    (the URL with the site root removed) and an ``articles_text`` column.
    The CSV export below is commented out, so nothing is written to disk.
    """
    # list all articles available on Bellingcat
    all_articles = list_all_articles()  # 29s to run
    df = pd.DataFrame(all_articles)
    df["path"] = df.url.apply(lambda x: x.split(BASE_URL, 1)[1])

    print("Table info:")
    # DataFrame.info is a method that prints its own summary; the previous
    # print(df.info) only printed the bound-method repr without calling it.
    df.info()

    df["articles_text"] = df.url.map(get_article_text)  # takes 5m40s to run

    # save to file
    # df.to_csv("all-bellingcat-articles.csv", index=False)
    # TODO check random months for URLs


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment