Edits to Rebecca Koeser's script to generate a dataset from the DH Q&A archive (preliminary)
#!/usr/bin/env python
'''
Script to parse data from the DH Q&A archive.

Install python dependencies:

    pip install beautifulsoup4 lxml pandas

Clone the DH Q&A archive repository:

    https://github.com/achorg/DH-Answers-Archive

Run this script in the top-level directory of the repository.
'''
import csv
import glob
import os
import re

import pandas as pd
from bs4 import BeautifulSoup, Comment

baseurl = 'http://digitalhumanities.org/answers'

questions = []
question_fieldnames = [
    'url', 'question', 'date', 'tags', 'author', 'author url',
    'content', '# responses', '# best responses'
]
for index_path, path in enumerate(glob.glob('topic/*/index.html')):
    print(path)
    qdata = {}
    with open(path) as topicdoc:
        soup = BeautifulSoup(topicdoc, 'html.parser')
        # page title is question (summary/brief)
        qdata['question'] = soup.find('h2').get_text()
        tags = soup.find_all('a', rel='tag')
        qdata['tags'] = ';'.join([t.get_text() for t in tags])
        # topic meta should include url for topic,
        # but is not completely reliable!
        # url = soup.find('ul', class_="topicmeta").find('a')['href']
        # url = url.split('#')[0]  # remove anchor link
        # Use base filename for url instead
        qdata['url'] = '%s/%s' % (baseurl, os.path.dirname(path))

    # Get all posts by matching ids against a regex, so each post's unique post-id is accessible
    posts = soup.find_all(id=re.compile("post-"))
    posts_data = []
    for index, post in enumerate(posts):
        # Store all post info in a dictionary
        post_data = {}
        # Get the post author and author url
        author = post.find('div', class_='threadauthor')
        author_url = author.a['href']
        if author_url.startswith('/'):
            author_url = '%s%s' % (baseurl, author_url)
        post_data['author url'] = author_url
        post_data['author'] = author.find('strong').get_text()
        # Get post id
        post_data['post_id'] = post.get('id')
        # The first post is the initial question; all others are replies
        post_data['is_initial_post'] = index == 0
        # Find and get post content
        post_content = post.find('div', class_='post')
        # Remove tweet buttons
        if post_content.find('div', class_="social-it") is not None:
            post_content.find('div', class_="social-it").extract()
        post_data['post_content'] = post_content.get_text()
        # Flag whether this post is marked as a best answer
        post_data['is_best_answer'] = post.find('div', class_='best_answer') is not None
        # Append post data to list
        posts_data.append(post_data)

    # Turn posts into a dataframe
    posts_df = pd.DataFrame(posts_data)

    # html doesn't have a proper date but RSS should;
    # get rss filename from rss link
    rss = soup.find('a', class_="rss-link")['href'].lstrip('/')
    if os.path.exists(rss):
        with open(rss) as rssdoc:
            rss_soup = BeautifulSoup(rssdoc, 'lxml')
            # Search for all dates
            pubdates = rss_soup.find_all('pubdate')
            # Check if dates exist
            if pubdates:
                # Get post ids by looking for links that are previous siblings of pubdate
                post_ids = [d.previous_sibling.split('#')[1].replace('\n', '').replace('\t', '')
                            for d in pubdates if '#' in d.previous_sibling]
                # Store first pubdate as the official post date
                qdata['date'] = pubdates[0].get_text()
                # Get text for the remaining dates
                dates = [date.get_text() for date in pubdates[1:]]
                # Create a new dataframe of post_ids and dates, reversed so the order
                # corresponds to the html page rather than the RSS feed
                rss_content = pd.DataFrame({'post_id': post_ids[::-1], 'post_date': dates[::-1]})
                # Merge with the posts dataframe on post_id. Occasionally there are
                # duplicate posts, and this merge helps identify them
                posts_df = posts_df.merge(rss_content, on='post_id', how='outer')
            else:
                print('ERROR: RSS file has no content: %s' % rss)
    else:
        print('ERROR: Missing RSS file: %s' % rss)
        # If there is no RSS feed, set dates to None
        qdata['date'] = None
        posts_df['post_date'] = None
        # NOTE: missing 11 topic RSS feeds;
        # may be able to get date from tag feeds

    # Turn question metadata into a dataframe
    question_metadata_df = pd.DataFrame.from_dict([qdata], orient='columns')
    # Combine the two dataframes side by side, creating nulls for the
    # metadata columns on every row but the first
    merged_df = pd.concat([posts_df, question_metadata_df], axis=1)
    # Fill nulls for metadata columns with the initial values
    merged_df[['date', 'question', 'tags', 'url']] = merged_df[['date', 'question', 'tags', 'url']].fillna(method='ffill')

    # Define output file name
    output_file = 'zoe_scraped_dhqa_archive.csv'
    # On the first pass through the loop, delete any existing file with the same name
    if os.path.exists(output_file) and index_path == 0:
        os.remove(output_file)
    # Check if the file exists
    if os.path.exists(output_file):
        # If the file exists, append on each pass so you don't lose data
        merged_df.to_csv(output_file, mode='a', header=False, index=False, encoding='utf-8-sig')
    else:
        # If the file doesn't exist, create it with a header
        merged_df.to_csv(output_file, header=True, index=False, encoding='utf-8-sig')
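The script appends every topic's posts to a single CSV. Below is a minimal sketch, not part of the gist, of loading that CSV with pandas for a quick look at the result; it assumes the script above has already been run and produced zoe_scraped_dhqa_archive.csv in the current directory.

import pandas as pd

# Load the generated dataset (assumes the scraping script has already run)
dhqa = pd.read_csv('zoe_scraped_dhqa_archive.csv')

# Quick sanity checks: preview rows, count topics and posts
print(dhqa.head())
print('topics:', dhqa['url'].nunique())
print('posts:', len(dhqa))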