Skip to content

Instantly share code, notes, and snippets.

@barik
Created June 11, 2021 19:04
Show Gist options
  • Save barik/650592696ff9ff6a118ece2b5c084115 to your computer and use it in GitHub Desktop.
Save barik/650592696ff9ff6a118ece2b5c084115 to your computer and use it in GitHub Desktop.
ijoc_scrape.py
from bs4 import BeautifulSoup
import requests
import csv
# Scrape the International Journal of Communication (IJoC) issue archive:
# for every issue on the archive page, visit the issue's table of contents
# and record (issue URL, article title, article file URL) rows, then dump
# everything to output.csv.
output = []

# Fetch the archive index. A timeout prevents a dead server from hanging the
# script forever; raise_for_status surfaces HTTP errors instead of silently
# parsing an error page.
archives = requests.get("https://ijoc.org/index.php/ijoc/issue/archive", timeout=30)
archives.raise_for_status()
soup = BeautifulSoup(archives.content, features="lxml")

# Each issue is rendered as a <div id="issue"> whose first <a> links to the
# issue's table-of-contents page.
results = soup.div.find_all("div", {"id": "issue"})
for r in results:
    link = r.a
    # Get the articles for the issue.
    issue_articles = requests.get(link["href"], timeout=30)
    issue_articles.raise_for_status()
    soup_articles = BeautifulSoup(
        issue_articles.content.decode("utf-8"), features="lxml"
    )
    # Each article on the issue page lives in a <table class="tocArticle">.
    articles = soup_articles.find_all("table", class_="tocArticle")
    for a in articles:
        title = a.find(class_="tocTitle")
        article_link = a.find(class_="file")
        # Skip malformed entries (missing title or file link) rather than
        # crashing the whole scrape on one bad table.
        if title is None or article_link is None:
            continue
        output.append([link["href"], title.text, article_link["href"]])

# newline="" is required by the csv module to avoid blank rows on Windows;
# utf-8 handles non-ASCII article titles.
with open("output.csv", "w", newline="", encoding="utf-8") as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["issue", "title", "href"])
    csv_writer.writerows(output)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment