# import necessary packages for webscraping.
from bs4 import BeautifulSoup
from urllib import request
from dateutil.parser import parse
import time
import random
import os
import re
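# This script scrapes the online issues of The Journal of Interactive Technology
# and Pedagogy (jitp.commons.gc.cuny.edu): it walks the main table of contents,
# collects the article links for each issue, and saves every article as an HTML
# file inside a folder named after its issue.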
def is_date(string, fuzzy=False):
    """
    Return whether the string can be interpreted as a date.

    :param string: str, string to check for date
    :param fuzzy: bool, ignore unknown tokens in string if True
    """
    try:
        parse(string, fuzzy=fuzzy)
        return True
    except ValueError:
        return False
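# For example, is_date('June 10, 2022') should return True, while a non-date string
# such as 'Special Feature' should make parse() raise ValueError, so is_date returns False.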
def get_issue_link(link):
    html = request.urlopen(link).read()
    soup = BeautifulSoup(html, 'html5lib')
    issue_toc = soup.select('div.textcontent')[0]
    raw_links = issue_toc.find_all('a')
    actual_issue_links = []
    # filter out navigation, sharing, comment, licensing, and other non-article links,
    # keeping only the links that point at articles in the issue
    for raw_link in raw_links:
        if raw_link.text.startswith('Issue') or raw_link['href'].startswith('https://jitp.commons.gc.cuny.edu/wp-content/plugins/peters-custom-anti-spam-image/custom_anti_spam.php'):
            pass
        elif raw_link.has_attr("title") and raw_link['title'] == "Share this article":
            pass
        elif raw_link['href'].endswith('#respond') or raw_link['href'].endswith('#comments') or '#comment' in raw_link['href']:
            pass
        elif raw_link.text in ['Attribution-NonCommercial-ShareAlike 4.0 International', 'Previous:', 'Next:', 'Learn how your comment data is processed', 'table of contents', 'Introduction /', 'JITP Issue 8 is now live! | Laura Wildemann Kane', 'JITP Issue 9 is now live! | Laura Wildemann Kane', 'Happenings – VREPS', 'Re-viewing Digital Technologies and Art History – DAHS', 'Wandering Volunteer Park /', '\n', 'Special Feature: Behind the Seams']:
            pass
        elif raw_link['href'] in ['https://creativecommons.org/licenses/by-nc-sa/4.0/', 'http://teacherstech.net/?p=10236']:
            pass
        elif is_date(raw_link.text):
            pass
        elif raw_link.text in [text.replace(u'\xa0', u'') for link, text in actual_issue_links]:
            pass
        else:
            actual_issue_links.append((raw_link['href'], raw_link.text.replace(u'\xa0', u'')))
    return actual_issue_links
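# get_issue_link returns one (href, title) tuple per article, e.g. something like
# ('https://jitp.commons.gc.cuny.edu/some-article/', 'Some Article Title') -- the
# example values here are illustrative, not taken from a real issue page.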
def scrape_contents_of_an_article(article_link):
    html = request.urlopen(article_link).read()
    soup = BeautifulSoup(html, 'html5lib')
    # swap this line and the following one to grab only the article and not the comments
    # issue_contents = soup.select('article')[0]
    issue_contents = soup.select('div#main')[0]
    # add 'li.a[href$="#comments"]' to get rid of comments
    list_of_junk = ['div.tagslist', 'div.iw-social-share', 'a[href^="https://jitp.commons.gc.cuny.edu/category/issues"]', 'section#post-nav', 'div.comment-respond', 'p.akismet_comment_form_privacy_notice', 'p[style="display: none !important;"]', 'section.comments p.buttons', 'section.comments img', 'img.avatar', 'div.featimg.animated', 'div.cat']
    for junk in list_of_junk:
        # print(junk)
        try:
            # decompose every element that matches this junk selector
            for item in issue_contents.select(junk):
                item.decompose()
        except Exception:
            print('could not match ' + junk)
    # get rid of comment links and reformat date to not be a line item
    try:
        replace_text = issue_contents.select('ul.textinfo')[0].find_all('li')[1].text.strip()
        # replace_text = '<p>' + replace_text + '</p>'
        # replace_text = BeautifulSoup(replace_text)
        issue_contents.select('ul.textinfo')[0].replace_with(replace_text)
        # text_info = issue_contents.select('ul.textinfo')[0].find_all('li')
        # # text_info[0].decompose()
        # text_info.contents = text_info[1].text
    except Exception:
        pass
    # search for image tags and replace direct links
    for img in issue_contents.find_all('img'):
        img['src'] = re.sub(
            r'https:\/\/jitp\.commons\.gc\.cuny\.edu\/files\/[0-9]+\/[0-9]+|src="http:\/\/jitp\.commons\.gc\.cuny\.edu\/files\/[0-9]+\/[0-9]+', "images", img['src'])
    # strip the onclick handler from footnote links
    for sup_tag in issue_contents.find_all("sup", {"class": "footnote"}):
        del sup_tag.findChild('a')['onclick']
    # setting the byline tag
    print(article_link)
    try:
        for hit in issue_contents.find_all('h2', {"class": 'byline'}):
            hit.name = 'p'
    except Exception:
        # if no h2 with byline - just pass
        # but the issue right now is that sometimes there is no byline class but there is an h2.
        # you could say "turn the first h2 into a p tag", but you won't know universally that is the case
        print('Fail: ' + article_link)
    # strip brackets from notes
    for hit in issue_contents.find_all('a', {'class': 'ftn'}):
        hit.string.replace_with(re.sub(r'[\[\]]', '', hit.string))
    for hit in issue_contents.find_all('a', {'class': 'ftnref'}):
        hit.string.replace_with(re.sub(r'[\[\]]', '', hit.string))
    return issue_contents
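# scrape_contents_of_an_article returns the BeautifulSoup tag for div#main with the
# junk elements removed, image src attributes rewritten to point at a local "images"
# path, and footnote markup cleaned up; clean_issue() below serializes it to a string.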
def clean_issue(title, issue_contents):
    metadata_title = "<head>\n<meta name=\"dc.title\" content=\"" + title + "\">\n</head>"
    clean_contents = metadata_title + str(issue_contents)
    # re.sub(
    #     r'img src="https:\/\/jitp\.commons\.gc\.cuny\.edu\/files\/[0-9]+\/[0-9]+|src="http:\/\/jitp\.commons\.gc\.cuny\.edu\/files\/[0-9]+\/[0-9]+', "<img src=\"images", clean_contents)
    return clean_contents
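# A cleaned article therefore starts with something like:
# <head>
# <meta name="dc.title" content="Some Article Title">
# </head>
# followed by the scraped markup ("Some Article Title" is an illustrative placeholder).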
def scrape_issue(issue_title, issue_links):
    if not os.path.exists(issue_title):
        os.mkdir(issue_title)
    print('Processing ' + issue_title)
    for link, title in issue_links:
        print(title)
        contents = scrape_contents_of_an_article(link)
        clean_contents = clean_issue(title, contents)
        with open(os.path.join(issue_title, title.replace('/', ' ') + '.html'), 'w') as fout:
            fout.write(clean_contents)
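# Each issue ends up as a folder named after the issue title, with one .html file per
# article; forward slashes in article titles are swapped for spaces so they are not
# treated as directory separators in the output path.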
def get_main_toc_links():
    # store the url we want to work with in the variable 'url'
    url = 'https://jitp.commons.gc.cuny.edu/issues/'
    html = request.urlopen(url).read()
    # turn it into soup
    soup = BeautifulSoup(html, 'html5lib')
    # grab just the toc from the page
    toc = soup.select('div.textcontent')[0]
    # grab all the anchor tags from the toc but throw away the ones about cc licensing
    raw_links = toc.find_all('a')[1:-2]
    actual_links = [(raw_link['href'], raw_link.text.replace(u'\xa0', u' ').replace('Table of Contents: ', '')) for raw_link in raw_links]
    return actual_links
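# get_main_toc_links returns (href, issue title) tuples for every issue listed on the
# /issues/ page, with the 'Table of Contents: ' prefix stripped from the titles.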
def get_all_issue_links(main_toc_links):
    links_for_individual_issues = {}
    # MODIFY HERE TO REDUCE NUMBER OF ISSUES SCRAPED
    for issue_toc_link, issue_title in main_toc_links:
        max_sleep = 5
        time.sleep(random.random() * max_sleep)
        print('=====')
        print('Scraping ' + issue_title)
        links_for_individual_issues[issue_title] = get_issue_link(issue_toc_link)
    return links_for_individual_issues
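# The random sleep of up to max_sleep (5) seconds between requests is presumably there
# to avoid hammering the journal's server; the function returns a dict mapping each
# issue title to its list of (link, title) article tuples.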
def main():
    # get all the main toc links
    main_toc_links = get_main_toc_links()
    main_toc_links.reverse()
    # use the main toc links to get each issue link
    links_for_individual_issues = get_all_issue_links(main_toc_links)
    for issue_title, issue_links in links_for_individual_issues.items():
        scrape_issue(issue_title, issue_links)


if __name__ == "__main__":
    main()
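# Running the script (e.g. `python scrape.py`, where the filename is just a placeholder
# for whatever you save this gist as) walks the issues list in reverse order and writes
# each issue's articles into per-issue folders in the current working directory.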