# Gist by @walshbr, created June 10, 2022 14:01.
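# Scrapes the issue archive of The Journal of Interactive Technology and
# Pedagogy (jitp.commons.gc.cuny.edu), cleans the HTML for each article, and
# writes one .html file per article into a directory named after its issue.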
# import necessary packages for web scraping
from bs4 import BeautifulSoup
from urllib import request
from dateutil.parser import parse
import time
import random
import os
import re


def is_date(string, fuzzy=False):
    """
    Return whether the string can be interpreted as a date.

    :param string: str, string to check for date
    :param fuzzy: bool, ignore unknown tokens in string if True
    """
    try:
        parse(string, fuzzy=fuzzy)
        return True
    except ValueError:
        return False
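
# Given the URL of an issue's table of contents, return a list of
# (href, link text) tuples for the articles in that issue, skipping
# navigation, licensing, sharing, comment, and duplicate links.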
def get_issue_link(link):
    html = request.urlopen(link).read()
    soup = BeautifulSoup(html, 'html5lib')
    issue_toc = soup.select('div.textcontent')[0]
    raw_links = issue_toc.find_all('a')
    actual_issue_links = []
    for raw_link in raw_links:
        if raw_link.text.startswith('Issue') or raw_link['href'].startswith('https://jitp.commons.gc.cuny.edu/wp-content/plugins/peters-custom-anti-spam-image/custom_anti_spam.php'):
            pass
        elif raw_link.has_attr("title") and raw_link['title'] == "Share this article":
            pass
        elif raw_link['href'].endswith('#respond') or raw_link['href'].endswith('#comments') or '#comment' in raw_link['href']:
            pass
        elif raw_link.text in ['Attribution-NonCommercial-ShareAlike 4.0 International', 'Previous:', 'Next:',
                               'Learn how your comment data is processed', 'table of contents', 'Introduction /',
                               'JITP Issue 8 is now live! | Laura Wildemann Kane',
                               'JITP Issue 9 is now live! | Laura Wildemann Kane',
                               'Happenings – VREPS', 'Re-viewing Digital Technologies and Art History – DAHS',
                               'Wandering Volunteer Park /', '\n', 'Special Feature: Behind the Seams']:
            pass
        elif raw_link['href'] in ['https://creativecommons.org/licenses/by-nc-sa/4.0/', 'http://teacherstech.net/?p=10236']:
            pass
        elif is_date(raw_link.text):
            pass
        elif raw_link.text in [text.replace(u'\xa0', u'') for link, text in actual_issue_links]:
            pass
        else:
            actual_issue_links.append((raw_link['href'], raw_link.text.replace(u'\xa0', u'')))
    return actual_issue_links
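
# Download a single article and return its main content with navigation,
# social-sharing widgets, comment forms, and other page chrome stripped out,
# image sources rewritten to a local "images" path, and footnote markup tidied.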
def scrape_contents_of_an_article(article_link):
    html = request.urlopen(article_link).read()
    soup = BeautifulSoup(html, 'html5lib')
    # swap this line and the following one to grab only the article and not the comments
    # issue_contents = soup.select('article')[0]
    issue_contents = soup.select('div#main')[0]
    # add 'li.a[href$="#comments"]' to this list to get rid of comments
    list_of_junk = ['div.tagslist', 'div.iw-social-share',
                    'a[href^="https://jitp.commons.gc.cuny.edu/category/issues"]',
                    'section#post-nav', 'div.comment-respond', 'p.akismet_comment_form_privacy_notice',
                    'p[style="display: none !important;"]', 'section.comments p.buttons',
                    'section.comments img', 'img.avatar', 'div.featimg.animated', 'div.cat']
    for junk in list_of_junk:
        # remove every element matching the selector; select() returns an
        # empty list when there is no match, so this is safe for all articles
        for item in issue_contents.select(junk):
            item.decompose()
    # get rid of comment links and reformat the date so it is not a list item
    try:
        replace_text = issue_contents.select('ul.textinfo')[0].find_all('li')[1].text.strip()
        issue_contents.select('ul.textinfo')[0].replace_with(replace_text)
    except Exception:
        # skip articles without a ul.textinfo block
        pass
    # search for image tags and point them at a local "images" path instead of the remote uploads directory
    for img in issue_contents.find_all('img'):
        img['src'] = re.sub(
            r'https?://jitp\.commons\.gc\.cuny\.edu/files/[0-9]+/[0-9]+',
            "images", img['src'])
    # drop the inline onclick handlers from footnote anchors
    for sup_tag in issue_contents.find_all("sup", {"class": "footnote"}):
        del sup_tag.find('a')['onclick']
    # demote byline headings to paragraphs
    print(article_link)
    try:
        for hit in issue_contents.find_all('h2', {"class": 'byline'}):
            hit.name = 'p'
    except Exception:
        # if there is no h2 with a byline class, just move on. The open issue is
        # that some articles have an h2 but no byline class; we could turn the
        # first h2 into a p tag, but that is not universally the right call.
        print('Fail: ' + article_link)
    # strip the square brackets from footnote and footnote-reference links
    for hit in issue_contents.find_all('a', {'class': 'ftn'}):
        hit.string.replace_with(re.sub(r'[\[\]]', '', hit.string))
    for hit in issue_contents.find_all('a', {'class': 'ftnref'}):
        hit.string.replace_with(re.sub(r'[\[\]]', '', hit.string))
    return issue_contents
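
# Prepend a <head> block carrying the article title as dc.title metadata
# to the cleaned article HTML and return the result as a string.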
def clean_issue(title, issue_contents):
    metadata_title = "<head>\n<meta name=\"dc.title\" content=\"" + title + "\">\n</head>"
    clean_contents = metadata_title + str(issue_contents)
    return clean_contents
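
# Scrape every article in one issue and write each one out as an .html file
# inside a directory named after the issue, creating the directory if needed.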
def scrape_issue(issue_title, issue_links):
    if not os.path.exists(issue_title):
        os.mkdir(issue_title)
    print('Processing ' + issue_title)
    for link, title in issue_links:
        print(title)
        contents = scrape_contents_of_an_article(link)
        clean_contents = clean_issue(title, contents)
        with open(os.path.join(issue_title, title.replace('/', ' ') + '.html'), 'w') as fout:
            fout.write(clean_contents)
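
# Fetch the main issues page and return (url, issue title) tuples for every
# issue's table of contents.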
def get_main_toc_links():
    # store the url we want to work with in the variable 'url'
    url = 'https://jitp.commons.gc.cuny.edu/issues/'
    html = request.urlopen(url).read()
    # turn it into soup
    soup = BeautifulSoup(html, 'html5lib')
    # grab just the toc from the page
    toc = soup.select('div.textcontent')[0]
    # grab all the anchor tags from the toc but throw away the ones about cc licensing
    raw_links = toc.find_all('a')[1:-2]
    actual_links = [(raw_link['href'], raw_link.text.replace(u'\xa0', u' ').replace('Table of Contents: ', ''))
                    for raw_link in raw_links]
    return actual_links
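
# Walk each issue's table of contents and build a dict mapping issue title to
# its list of article links, sleeping a random interval between requests to
# avoid hammering the server.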
def get_all_issue_links(main_toc_links):
    links_for_individual_issues = {}
    # MODIFY HERE TO REDUCE NUMBER OF ISSUES SCRAPED
    for issue_toc_link, issue_title in main_toc_links:
        max_sleep = 5
        time.sleep(random.random() * max_sleep)
        print('=====')
        print('Scraping ' + issue_title)
        links_for_individual_issues[issue_title] = get_issue_link(issue_toc_link)
    return links_for_individual_issues
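
# Entry point: collect the issue list, reverse it, gather article links for
# each issue, and write the cleaned articles to disk.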
def main():
    # get all the main toc links
    main_toc_links = get_main_toc_links()
    main_toc_links.reverse()
    # use the main toc links to get each issue link
    links_for_individual_issues = get_all_issue_links(main_toc_links)
    for issue_title, issue_links in links_for_individual_issues.items():
        scrape_issue(issue_title, issue_links)


if __name__ == "__main__":
    main()
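
# To run, execute this file directly (e.g. `python scrape_jitp.py`, assuming
# that filename); the per-issue output directories are created in the current
# working directory.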