Skip to content

Instantly share code, notes, and snippets.

Last active January 10, 2022 03:48
Show Gist options
  • Save AndrewPardoe/8701800c118e1945a4ea63a65f7f7acb to your computer and use it in GitHub Desktop.
Save AndrewPardoe/8701800c118e1945a4ea63a65f7f7acb to your computer and use it in GitHub Desktop.
Scrape flat list of UserVoice Ideas with links from a category's UserVoice pages
# UserVoice doesn't have a search capability that will filter on category. I only care about my category (C++) in a huge
# Visual Studio database. This script scrapes all UserVoice suggestions in my category with links into an HTML document.
# Improvements welcome from those who actually know Python--this is the first Python script I've ever needed to write.
import html
import re
import urllib.request

import requests
from bs4 import BeautifulSoup
# Whack any Unicode characters when printing to file. Not correct, but not crashing.
def safewrite(file, string):
    """Write *string* to *file*, replacing unencodable characters with ``?``.

    The whole string is written in a single call when the file's encoding
    can represent it. If that write raises ``UnicodeEncodeError``, the
    string is written again one character at a time and every character
    the encoding rejects is replaced with ``?``. The result is lossy, but
    the script keeps running.

    Args:
        file: A writable text-mode file object.
        string: The text to write.
    """
    try:
        file.write(string)
    except UnicodeEncodeError:
        # Assumes the failed write emitted nothing (text files encode the
        # whole string before buffering it), so starting over per character
        # does not duplicate output.
        for char in string:
            try:
                file.write(char)
            except UnicodeEncodeError:
                file.write('?')
# Specifics of my UserVoice page and category.
# `prefix` must be an absolute base URL (scheme + host): it is prepended to the
# forum paths fetched with requests.get() and to every idea link written to the
# output page, so an empty prefix gives un-fetchable URLs and relative links.
# NOTE(review): the host was missing here; visualstudio.uservoice.com is the
# site that hosted forum 121579 -- confirm before running.
prefix = 'https://visualstudio.uservoice.com'
firstPage = prefix + '/forums/121579-visual-studio-ide/category/30937-languages-c'
nextPage = prefix + '/forums/121579-visual-studio-ide/category/30937-languages-c/filters/top?page={}'
# Ideas are split across many pages. Find all page numbers from the Pagination control.
soup = BeautifulSoup(requests.get(firstPage).content, "lxml")
# find() returns None when the page has no div with class "uvPagination".
pagination = soup.find("div", attrs={'class':'uvPagination'})
def pageeq(href):
    """Attribute filter that accepts hrefs containing ``page=``.

    Intended to be passed as ``href=pageeq`` to BeautifulSoup's ``find_all``,
    which keeps a tag when the filter returns a truthy value.

    Args:
        href: The tag's ``href`` attribute value, or ``None`` when absent.

    Returns:
        ``href`` itself when it is falsy (``None`` or empty); otherwise the
        ``re.Match`` for ``page=`` in ``href``, or ``None`` if it is not found.
    """
    if not href:
        return href
    return re.search("page=", href)
# Find the last page number among the pagination links.
# Start from 1 so that a category without a pagination control (a single page
# of ideas) still gets its one page processed.
last_page = 1
if pagination is not None:
    for pageref in pagination.find_all(href=pageeq):
        # Links whose text contains no digits are skipped; get_text() is used
        # because .string is None when a link wraps more than one child node.
        numbers = re.findall(r"\d+", pageref.get_text())
        if numbers:
            last_page = max(last_page, int(numbers[0]))
# Exclusive upper bound for range() over the 1-based page numbers.
upper = last_page + 1
# Create a local HTML page with a list of my category's UserVoice links.
# The file is written as UTF-8 and declares that charset, so idea titles
# containing non-ASCII characters are stored intact instead of depending on the
# platform's default encoding.
outfile = open('UserVoice.html', 'w', encoding='utf-8')
outfile.write("<html>\n<head>\n<meta charset=\"utf-8\">\n<title>UserVoice items from {0}</title>\n</head>\n".format(firstPage))
outfile.write("<body>\n<h2>UserVoice items from {0}</h2>\n<ul>\n".format(firstPage))
# Run through every page, find the idea links, and write one list item per
# idea: the prefix plus the link target as the href, the link text as the title.
for page in range(1, upper):
    print("Processing page {0} of {1}".format(page, upper - 1))
    soup = BeautifulSoup(requests.get(nextPage.format(page)).content, "lxml")
    for header in soup.find_all("h2", class_="uvIdeaTitle"):
        link = header.a
        if link is None or not link.get('href'):
            # A title heading without a usable link has nothing to point at.
            continue
        # Escape both values: titles in a C++ category routinely contain
        # '<', '>' and '&' (e.g. "vector<bool>"), which would otherwise be
        # parsed as markup in the generated page.
        href = html.escape(prefix + link.get('href'), quote=True)
        title = html.escape(link.get_text(), quote=False)
        outfile.write("\t<li><a href=\"{0}\">".format(href))
        safewrite(outfile, title)
        # Close the anchor and list item opened above.
        outfile.write("</a></li>\n")
# Close out the HTML page: end the list and the document, then close the file
# so that buffered output is flushed to disk.
outfile.write("</ul>\n</body>\n</html>\n")
outfile.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment