Bulk downloading from Archive.org's Wayback Machine and extracting data into CSVs (e.g. StumbleUpon)
import pandas as pd
import csv

# Run after parsing: removes items captured in multiple snapshots,
# keeping the last occurrence of each id (the most recently written row).
df = pd.read_csv('data-parsed/parsed.csv')
df = df.drop_duplicates(subset='id', keep='last')
df.to_csv('data-parsed/parsed-cleaned.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)
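For illustration, keep='last' retains the final row written for each id, which corresponds to the newest snapshot if rows were appended in snapshot order (a minimal sketch with made-up rows):

import pandas as pd

# Hypothetical duplicate rows: the same item captured in two snapshots
df = pd.DataFrame({
    'id': ['video_12345', 'video_12345'],
    'view_count': [100, 250],
})
print(df.drop_duplicates(subset='id', keep='last'))  # keeps the row with view_count 250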
import os
import csv
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import unquote

root_dir = r'data-raw'
max_files = 9999
stumbleupon_prefix = 'http://www.stumbleupon.com/url/'
output_csv = 'data-parsed/parsed.csv'

def extract_metadata(file_path):
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()
    soup = BeautifulSoup(content, 'lxml')
    list_items = soup.find_all(class_='listLi')
    metadata_list = []
    for item in list_items:
        user = item.find(class_='avatar')
        reviews = item.find(class_='showReview') or item.find(class_='showStumble')
        raw_url = f'https://{reviews.find("a")["href"][len(stumbleupon_prefix):]}'
        views = item.find(class_='views')
        # Assumes relative path `data-raw\20091011113141\www.stumbleupon.com\discover\toprated\index.html`
        path_parts = file_path.split(os.sep)
        metadata = {
            'id': f"{item.find('var')['class'][0]}_{item.find('var')['id']}",
            'url': unquote(unquote(raw_url)),
            'title': item.find("span", class_='img').find("img")["alt"].strip(),
            'review_count': int(''.join(filter(str.isdigit, reviews.find('a').get_text(strip=True).split()[0]))),
            'view_count': int(''.join(filter(str.isdigit, views.find('a')['title'].split()[0]))),
            'date': int(path_parts[-5]),
            'user_id': int(user['id']) if user else -1,
            'user_name': user['title'] if user else 'Unavailable',
            'source': path_parts[-2]
        }
        metadata_list.append(metadata)
    return metadata_list

# Ensure the output directory exists
os.makedirs(os.path.dirname(output_csv), exist_ok=True)

# Write metadata to CSV
with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['id', 'url', 'title', 'review_count', 'view_count', 'date', 'user_id', 'user_name', 'source']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC)
    writer.writeheader()

    # Traverse directories and process HTML files
    file_count = 0
    for subdir, _, files in os.walk(root_dir):
        for file in files:
            if file_count >= max_files:
                break
            file_path = os.path.join(subdir, file)
            # Check if the file has no extension or has an .html extension
            if not os.path.splitext(file)[1] or file.endswith('.html'):
                metadata_list = extract_metadata(file_path)
                for data in metadata_list:
                    writer.writerow(data)
                file_count += 1
                current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                print(f"{current_time}: Processed {len(metadata_list)} items from #{file_count}: {file_path}")
        if file_count >= max_files:
            break
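As a quick sanity check of extract_metadata(), the snippet below can be appended after the function definition. It is a sketch only: the HTML fragment and its values are made up, but follow the markup the parser expects (listLi items containing var, avatar, showReview, and views elements), and the file is written at the directory depth the path-splitting assumes:

sample_html = '''
<li class="listLi">
  <var class="video" id="12345"></var>
  <span class="img"><img alt="Example Title"/></span>
  <a class="avatar" id="678" title="someuser"></a>
  <span class="showReview"><a href="http://www.stumbleupon.com/url/example.com%2Fpage">3 reviews</a></span>
  <span class="views"><a title="1,234 views">1k+</a></span>
</li>
'''

# Hypothetical snapshot path matching the layout the parser assumes
sample_path = os.path.join('data-raw', '20091011113141', 'www.stumbleupon.com',
                           'discover', 'toprated', 'index.html')
os.makedirs(os.path.dirname(sample_path), exist_ok=True)
with open(sample_path, 'w', encoding='utf-8') as f:
    f.write(sample_html)

print(extract_metadata(sample_path))
# [{'id': 'video_12345', 'url': 'https://example.com/page', 'title': 'Example Title',
#   'review_count': 3, 'view_count': 1234, 'date': 20091011113141, 'user_id': 678,
#   'user_name': 'someuser', 'source': 'toprated'}]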
http://www.stumbleupon.com/discover/toprated/
http://www.stumbleupon.com/discover/animation/
http://www.stumbleupon.com/discover/arts/
http://www.stumbleupon.com/discover/bizarre/
http://www.stumbleupon.com/discover/books/
http://www.stumbleupon.com/discover/business/
http://www.stumbleupon.com/discover/cats/
http://www.stumbleupon.com/discover/computer-graphics/
http://www.stumbleupon.com/discover/computers/
http://www.stumbleupon.com/discover/drawing/
http://www.stumbleupon.com/discover/food/
http://www.stumbleupon.com/discover/fun/
http://www.stumbleupon.com/discover/graphic-design/
http://www.stumbleupon.com/discover/health/
http://www.stumbleupon.com/discover/humor/
http://www.stumbleupon.com/discover/internet-tools/
http://www.stumbleupon.com/discover/internet/
http://www.stumbleupon.com/discover/lifestyle/
http://www.stumbleupon.com/discover/movies/
http://www.stumbleupon.com/discover/music/
http://www.stumbleupon.com/discover/news/
http://www.stumbleupon.com/discover/online-games/
http://www.stumbleupon.com/discover/photography/
http://www.stumbleupon.com/discover/politics/
http://www.stumbleupon.com/discover/psychology/
http://www.stumbleupon.com/discover/satire/
http://www.stumbleupon.com/discover/science/
http://www.stumbleupon.com/discover/self-improvement/
http://www.stumbleupon.com/discover/sports/
http://www.stumbleupon.com/discover/technology/
http://www.stumbleupon.com/discover/travel/
http://www.stumbleupon.com/discover/videos/
waybackpack ^
http://www.stumbleupon.com/discover/videos/ ^
-d "/Users/jake_/Documents/Projects/StumbleUpon-extract/data-raw" ^
--from-date 20091001 ^
--to-date 20120318 ^
--raw ^
--no-clobber ^
--delay-retry 2 ^
--max-retries 10 ^
--ignore-errors ^
--user-agent "waybackpack x@example.com"