Last active May 3, 2020 20:39
Edits to Rebecca Koeser's script to generate dataset from DH Q&A archive (preliminary)
#!/usr/bin/env python
Script to parse data from the DH Q&A archive.
Install python dependencies:
pip install beautifulsoup4 lxml
Clone DH Q&A archive repository:
Run this script in the top-level directory of the repository.
import csv
import glob
import os
import re
import pandas as pd
from bs4 import BeautifulSoup, Comment
# Prefix joined onto topic directories and site-relative author links when
# building URLs. It is empty here, so those URLs stay relative.
baseurl = ''

# NOTE(review): `questions` and `question_fieldnames` are not referenced by the
# scraping loop visible in this file; they look like leftovers from an earlier
# csv-based version of the script. Kept so any other user keeps working.
questions = []
question_fieldnames = [
    'url', 'question', 'date', 'tags', 'author', 'author url',
    'content', '# responses', '# best responses',
]
# Walk every archived topic page, collect one row per post (author, content,
# best-answer flag, RSS publication date) plus the topic-level metadata, and
# write the rows to a single CSV. The CSV is rebuilt from scratch on each run
# and extended one topic at a time, so a crash part-way through keeps the
# topics already processed.

# Column order for the per-post records. Pinning it keeps every chunk that is
# appended to the CSV without a header aligned with the first chunk, even for
# a topic page on which no posts are found.
post_fieldnames = [
    'author url', 'author', 'post_id', 'is_initial_post',
    'post_content', 'is_best_answer',
]
# Topic-level columns that get copied onto every post row
metadata_fieldnames = ['date', 'question', 'tags', 'url']
# Define output file name
output_file = 'zoe_scraped_dhqa_archive.csv'

for index_path, path in enumerate(glob.glob('topic/*/index.html')):
    qdata = {}
    # NOTE(review): files are opened with the platform default encoding, as in
    # the original script -- confirm the archive is readable that way.
    with open(path) as topicdoc:
        soup = BeautifulSoup(topicdoc, 'html.parser')

    # page title is question (summary/brief)
    qdata['question'] = soup.find('h2').get_text()
    tags = soup.find_all('a', rel='tag')
    qdata['tags'] = ';'.join([t.get_text() for t in tags])

    # The topic meta block should include the url for the topic, but it is
    # not completely reliable, so the topic directory is used for the url.
    qdata['url'] = '%s/%s' % (baseurl, os.path.dirname(path))

    # Get all posts using regex and allowing us to access their unique post-id
    posts = soup.find_all(id=re.compile("post-"))
    posts_data = []
    for index, post in enumerate(posts):
        # Store all post info in dictionary
        post_data = {}

        # Get author name and profile link
        author = post.find('div', class_='threadauthor')
        author_url = author.a['href']
        if author_url.startswith('/'):
            author_url = '%s%s' % (baseurl, author_url)
        post_data['author url'] = author_url
        post_data['author'] = author.find('strong').get_text()

        # Get post id
        post_data['post_id'] = post.get('id')

        # The first post on the page is the original question,
        # everything after it is a reply
        post_data['is_initial_post'] = index == 0

        # Find and get post content
        post_content = post.find('div', class_='post')
        # Remove tweet buttons so their text does not end up in the content
        social_buttons = post_content.find('div', class_="social-it")
        if social_buttons is not None:
            social_buttons.extract()
        post_data['post_content'] = post_content.get_text()

        # Get best answer
        post_data['is_best_answer'] = (
            post.find('div', class_='best_answer') is not None)

        # Append post data to list
        posts_data.append(post_data)

    # Turn posts into dataframe
    posts_df = pd.DataFrame(posts_data, columns=post_fieldnames)

    # html doesn't have a proper date but RSS should
    # get rss filename from rss link
    rss = soup.find('a', class_="rss-link")['href'].lstrip('/')
    has_dates = False
    if os.path.exists(rss):
        with open(rss) as rssdoc:
            rss_soup = BeautifulSoup(rssdoc, 'lxml')
        # Search for all dates
        pubdates = rss_soup.find_all('pubdate')
        # Check if dates exist
        if pubdates:
            has_dates = True
            # Store first pubdate as the official post date
            qdata['date'] = pubdates[0].get_text()

            # The text just before a pubdate is the item link, whose anchor
            # is the post id. Taking the id and the date from the same
            # pubdate element keeps the two aligned even if some pubdate
            # other than the first one has no usable link.
            rss_rows = []
            for pubdate in pubdates:
                link = pubdate.previous_sibling
                if isinstance(link, str) and '#' in link:
                    post_id = link.split('#')[1]
                    post_id = post_id.replace('\n', '').replace('\t', '')
                    rss_rows.append({
                        'post_id': post_id,
                        'post_date': pubdate.get_text(),
                    })

            # Reverse order to correspond to html page rather than RSS feed
            rss_content = pd.DataFrame(
                rss_rows[::-1], columns=['post_id', 'post_date'])
            # Merge with posts dataframe on post_id. Occasionally there are
            # duplicate posts and this will help us identify them
            posts_df = posts_df.merge(rss_content, on='post_id', how='outer')
        else:
            print('ERROR: RSS file has no content: %s' % rss)
    else:
        print('ERROR: Missing RSS file: %s' % rss)

    if not has_dates:
        # No usable RSS feed: set dates to none so the columns still exist
        # and every topic writes the same set of columns to the CSV.
        # NOTE: missing 11 topic RSS feeds
        # may be able to get date from tag feeds
        qdata['date'] = None
        posts_df['post_date'] = None

    # Turn metadata into a one-row dataframe
    question_metadata_df = pd.DataFrame([qdata])
    # Put the two dataframes side by side; metadata only lands on the first
    # row, the remaining rows are null
    merged_df = pd.concat([posts_df, question_metadata_df], axis=1)
    # Fill nulls for metadata columns with initial values
    merged_df[metadata_fieldnames] = merged_df[metadata_fieldnames].ffill()

    # On the first topic, delete a file left over from a previous run so the
    # output is not appended to stale data
    if index_path == 0 and os.path.exists(output_file):
        os.remove(output_file)

    # Append for each topic so that way you don't lose your data; the header
    # is only written when the file is being created
    write_header = not os.path.exists(output_file)
    merged_df.to_csv(output_file, mode='a', header=write_header, index=False,
                     encoding='utf-8-sig')
