Created
February 9, 2024 14:22
-
-
Save jacKlinc/c3e1464e434f4dc5871c547120039d4b to your computer and use it in GitHub Desktop.
Download all Bellingcat articles and save HTML content to CSV file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from dataclasses import dataclass
from datetime import date
from functools import reduce
from typing import Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup
from newspaper import Article
# Root of the Bellingcat site; monthly archive pages live under /news/<year>/<month>/.
BASE_URL = "https://www.bellingcat.com"
BELLINGCAT_START_YEAR = 2014  # earliest article on site
@dataclass
class ArticleItem:
    """A single article link scraped from one monthly archive page."""
    year: int  # year of the archive page the link was found on
    month: int  # month (1-12) of the archive page
    url: str  # href from the tile's <a>; main() assumes it starts with BASE_URL
def list_months_article(year: int, month: int) -> list[ArticleItem]:
    """Scrape one monthly archive page and return the article links on it.

    Fetches {BASE_URL}/news/<year>/<MM>/ and extracts the href from each
    news-item tile.

    Args:
        year: archive year to fetch.
        month: archive month, 1-12.

    Returns:
        One ArticleItem per article tile found on the page.
    """
    # BUG FIX: the month segment was built with f"0{month}", which yields
    # "010".."012" for October-December and requests the wrong URLs.
    # Zero-pad properly instead so all 12 months resolve.
    res = requests.get(f"{BASE_URL}/news/{year}/{month:02d}/")
    soup = BeautifulSoup(res.content, "html.parser")
    # Each article on the archive page is wrapped in this div.
    news_item_tags = soup.find_all("div", {"class": "news_item__image"})

    def to_item(tag) -> ArticleItem:
        # NOTE(review): assumes every tile contains an <a> with an href;
        # a malformed tile would raise TypeError/KeyError here — confirm.
        return ArticleItem(year, month, tag.findChild("a")["href"])

    return [to_item(t) for t in news_item_tags]
def flatten_list(x: list, y: list) -> list:
    """Concatenate two lists; used as a reducer to flatten nested lists."""
    combined = [*x, *y]
    return combined
# TODO refactor
def list_years_articles(year: int) -> list[ArticleItem]:
    """Collect the article links from every monthly archive page of *year*."""
    collected: list[ArticleItem] = []
    for month in range(1, 13):  # January through December
        collected.extend(list_months_article(year, month))
    return collected
def list_all_articles(end_year: Optional[int] = None) -> list[ArticleItem]:
    """List every article from BELLINGCAT_START_YEAR up to *end_year* (exclusive).

    Args:
        end_year: exclusive upper bound on the years scraped. Defaults to
            the year after the current one, so the current year is included.
            (The original hard-coded 2024, silently dropping newer articles;
            pass end_year=2024 to reproduce that behavior.)

    Returns:
        All ArticleItems found, in year order then month order.
    """
    if end_year is None:
        end_year = date.today().year + 1
    per_year = [list_years_articles(y) for y in range(BELLINGCAT_START_YEAR, end_year)]
    # Initializer makes the reduce safe even when the year range is empty.
    return reduce(flatten_list, per_year, [])
def get_article_text(url: str) -> str:
    """Download *url* with newspaper and return the parsed article body text."""
    piece = Article(url)
    piece.download()
    piece.parse()
    return piece.text
def main():
    """Scrape every Bellingcat article and collect its text into a DataFrame."""
    # List all articles available on Bellingcat (~29s when originally timed).
    all_articles = list_all_articles()
    df = pd.DataFrame(all_articles)
    # Keep just the article path by stripping the site prefix.
    # NOTE(review): assumes every scraped href contains BASE_URL — a
    # relative href would raise IndexError here; confirm against the site.
    df["path"] = df.url.apply(lambda x: x.split(BASE_URL, 1)[1])
    print("Table info:")
    # BUG FIX: `print(df.info)` printed the bound-method repr, not the
    # summary; DataFrame.info() prints the summary itself.
    df.info()
    # Download and parse each article body (~5m40s when originally timed).
    df["articles_text"] = df.url.map(get_article_text)
    # save to file
    # df.to_csv("all-bellingcat-articles.csv", index=False)
    # TODO check random months for URLs


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment