Created
February 9, 2024 14:22
-
-
Save jacKlinc/c3e1464e434f4dc5871c547120039d4b to your computer and use it in GitHub Desktop.
Download all Bellingcat articles and save HTML content to CSV file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from dataclasses import dataclass
from datetime import date
from functools import reduce
from typing import Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup
from newspaper import Article
# Root of the Bellingcat site; monthly archive pages live under /news/<year>/<month>/.
BASE_URL = "https://www.bellingcat.com"
BELLINGCAT_START_YEAR = 2014  # earliest article on site
@dataclass
class ArticleItem:
    """A single article link scraped from one monthly archive page."""
    year: int  # year of the archive page the link was found on
    month: int  # month (1-12) of the archive page
    url: str  # href from the tile's <a>; main() assumes it starts with BASE_URL
def list_months_article(year: int, month: int) -> list[ArticleItem]:
    """Scrape one monthly archive page and return the article links on it.

    Fetches {BASE_URL}/news/<year>/<MM>/ and extracts the href from each
    news-item tile.

    Args:
        year: archive year to fetch.
        month: archive month, 1-12.

    Returns:
        One ArticleItem per article tile found on the page.
    """
    # BUG FIX: the month segment was built with f"0{month}", which yields
    # "010".."012" for October-December and requests the wrong URLs.
    # Zero-pad properly instead so all 12 months resolve.
    res = requests.get(f"{BASE_URL}/news/{year}/{month:02d}/")
    soup = BeautifulSoup(res.content, "html.parser")
    # Each article on the archive page is wrapped in this div.
    news_item_tags = soup.find_all("div", {"class": "news_item__image"})

    def to_item(tag) -> ArticleItem:
        # NOTE(review): assumes every tile contains an <a> with an href;
        # a malformed tile would raise TypeError/KeyError here — confirm.
        return ArticleItem(year, month, tag.findChild("a")["href"])

    return [to_item(t) for t in news_item_tags]
def flatten_list(x: list, y: list) -> list:
    """Concatenate two lists; used as a reducer to flatten nested lists."""
    combined = [*x, *y]
    return combined
# TODO refactor
def list_years_articles(year: int) -> list[ArticleItem]:
    """Collect the article links from every monthly archive page of *year*."""
    collected: list[ArticleItem] = []
    for month in range(1, 13):  # January through December
        collected.extend(list_months_article(year, month))
    return collected
def list_all_articles(end_year: Optional[int] = None) -> list[ArticleItem]:
    """List every article from BELLINGCAT_START_YEAR up to *end_year* (exclusive).

    Args:
        end_year: exclusive upper bound on the years scraped. Defaults to
            the year after the current one, so the current year is included.
            (The original hard-coded 2024, silently dropping newer articles;
            pass end_year=2024 to reproduce that behavior.)

    Returns:
        All ArticleItems found, in year order then month order.
    """
    if end_year is None:
        end_year = date.today().year + 1
    per_year = [list_years_articles(y) for y in range(BELLINGCAT_START_YEAR, end_year)]
    # Initializer makes the reduce safe even when the year range is empty.
    return reduce(flatten_list, per_year, [])
def get_article_text(url: str) -> str:
    """Download *url* with newspaper and return the parsed article body text."""
    piece = Article(url)
    piece.download()
    piece.parse()
    return piece.text
def main():
    """Scrape every Bellingcat article and collect its text into a DataFrame."""
    # List all articles available on Bellingcat (~29s when originally timed).
    all_articles = list_all_articles()
    df = pd.DataFrame(all_articles)
    # Keep just the article path by stripping the site prefix.
    # NOTE(review): assumes every scraped href contains BASE_URL — a
    # relative href would raise IndexError here; confirm against the site.
    df["path"] = df.url.apply(lambda x: x.split(BASE_URL, 1)[1])
    print("Table info:")
    # BUG FIX: `print(df.info)` printed the bound-method repr, not the
    # summary; DataFrame.info() prints the summary itself.
    df.info()
    # Download and parse each article body (~5m40s when originally timed).
    df["articles_text"] = df.url.map(get_article_text)
    # save to file
    # df.to_csv("all-bellingcat-articles.csv", index=False)
    # TODO check random months for URLs


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment