@ws909 · Created October 21, 2023
Community-disputed posts on Stack Overflow
"""
Finds posts where your voting disagreed with the other site members.
Outputs a list of questions that you have downvoted, but have an overall positive score, as well as questions you have upvoted,
which have either been closed, or have an overall negative score.
This information is not available with StacksAPI, unless every single post on the site is iterated through. Is that faster?
If the filtering can be done server-side, or the total amount of data downloaded is smaller, it definitely should be.
One thing is for sure, though: this script is incredibly slow.
"""
import os.path
from enum import Enum
from getpass import getpass
from typing import Iterator

import requests
import lxml  # Imported only to ensure the faster lxml parser is available to BeautifulSoup
import cchardet  # Imported only for faster character-set detection in BeautifulSoup
from bs4 import BeautifulSoup, SoupStrainer

user_id: int = 0
session: requests.Session | None = None
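

# The docstring's brute-force alternative, as a minimal sketch: walk every question on the site
# through the Stack Exchange API. It is illustrative only and never called by this script.
# The /2.3/questions endpoint, its "site"/"page"/"pagesize" parameters, and the "items"/"has_more"
# response fields are documented API features; `iterate_all_questions` is a hypothetical helper
# that exists only for this sketch (unauthenticated requests are also subject to a daily quota).
def iterate_all_questions():
    page = 1
    while True:
        wrapper = requests.get(
            "https://api.stackexchange.com/2.3/questions",
            params={"site": "stackoverflow", "page": page, "pagesize": 100},
        ).json()
        # Each item includes "score", and a "closed_date" field when the question is closed.
        yield from wrapper["items"]
        if not wrapper.get("has_more"):
            break
        page += 1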


class PostType(Enum):
    QUESTION = "Q"
    ANSWER = "A"


class Vote(Enum):
    UPVOTE = +1
    DOWNVOTE = -1


class Post:
    def __init__(self, post_id: int | str, url: str, post_type: PostType,
                 is_closed: bool, vote_score: int, interaction_date):
        self.id: int | str = post_id
        self.url: str = url
        self.type: PostType = post_type
        self.is_closed: bool = is_closed
        self.vote_score: int = vote_score
        self.self_vote: Vote | None = None
        self.interaction_date = interaction_date

    def __str__(self):
        return f"{self.type.value} {self.id}: {self.url}"


def format_post(post: Post):
    closed_text = " closed;" if post.is_closed else ""
    return f"{post.type.value} {post.id}; votes: {post.vote_score} | {post.self_vote.value};{closed_text} {post.url}"


def document_of(url, **kwargs):
    return BeautifulSoup(session.get(url).text, features="lxml", **kwargs)
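

# A worked example of the sign test in `filter_post` below (scores are illustrative):
# a question we downvoted that now stands at +12 gives 12 * -1 < 0, so it is kept as disputed;
# one we upvoted that stands at -3 gives -3 * 1 < 0, also disputed. When our vote and the
# community score share a sign, the product is non-negative and the post is filtered out.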
def filter_post(post: Post):
    if post.self_vote == Vote.UPVOTE and post.is_closed:
        return True
    return post.vote_score * post.self_vote.value < 0


def scrape_post(post_id, post_type: PostType, interaction_date):
    url = f"https://stackoverflow.com/{post_type.value.lower()}/{post_id}"
    strainer = SoupStrainer("div", attrs={"id": "mainbar"})
    document = document_of(url, parse_only=strainer)

    if post_type is PostType.QUESTION:
        post = document.find(id="question")
    else:
        post = document.find(id=f"answer-{post_id}")
    if post is None:
        raise KeyError(url)

    is_closed = False
    if post_type is PostType.QUESTION:
        try:
            is_closed = (post.find(class_="s-prose js-post-body")
                         .find("aside", class_="s-notice s-notice__info post-notice js-post-notice mb16", string="Closed")
                         is not None)
        except AttributeError:
            pass

    vote_score = int(post.find(class_="js-vote-count").text)
    return Post(post_id, url, post_type, is_closed, vote_score, interaction_date)


def get_posts(filter_function):
    for post in scrape_from_tabs():
        if not filter_function(post):  # TODO: maybe count how many are skipped?
            continue
        yield post


def scrape_from_tabs():
    for post in scrape_from_profile_tab("upvote"):
        post.self_vote = Vote.UPVOTE
        yield post
    for post in scrape_from_profile_tab("downvote"):
        post.self_vote = Vote.DOWNVOTE
        yield post


def scrape_from_profile_tab(tab: str) -> Iterator[Post]:
    # Walk the paginated votes tab until a page comes back empty. Raising StopIteration
    # inside a generator is converted to a RuntimeError (PEP 479), so an empty page is
    # used as the stop signal instead.
    page_number = 0
    while True:
        page_number += 1
        page_posts = list(scrape_vote_links(tab, user_id, page_number))
        if not page_posts:
            return
        yield from page_posts


def scrape_vote_links(vote_type: str, user_id: int, page_number: int) -> Iterator[Post]:
    url = f"https://stackoverflow.com/users/{user_id}?tab=votes&sort={vote_type}&page={page_number}"
    return scrape_vote_page_links(url)


def scrape_vote_page_links(url: str) -> Iterator[Post]:
    strainer = SoupStrainer("div", attrs={"id": "user-tab-votes"})
    document = document_of(url, parse_only=strainer)
    try:
        posts_links = document.find(id="user-tab-votes").find(class_="js-expandable-posts").find_all("a")
    except AttributeError:
        return  # Reading past the last page of votes: no post list is rendered
    for entry in posts_links:
        components = entry["href"].split("/")
        # TODO: also scrape the date! It's vital to know _when_ the vote was cast!
        # Will probably have to change `.find_all("a")` to something else for that.
        interaction_date = None
        try:
            # Answer links end in the numeric answer id, e.g. ".../{answer_id}#{answer_id}".
            yield scrape_post(int(components[-1].split("#")[0]), PostType.ANSWER, interaction_date)
        except ValueError:
            # Question links look like "/questions/{question_id}/{slug}", so the id is the third component.
            yield scrape_post(components[2], PostType.QUESTION, interaction_date)
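

# Illustrative behaviour of the helper below (example names are hypothetical): with name "report"
# and extension "txt" it returns "report.txt" if that file does not exist, otherwise "report 2.txt",
# then "report 3.txt", and so on.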
def next_available_file_name(name: str, extension: str):
    if not os.path.exists(file_name := name + "." + extension):
        return file_name
    i = 2
    while os.path.exists(file_name := name + f" {i}." + extension):
        i += 1
    return file_name


if __name__ == '__main__':
    # Alternative to username + password:
    # https://api.stackexchange.com/docs/authentication (requires registering this script on StackApps).
    # Asking for credentials is fine for a small script like this.
    username = getpass("e-mail: ")
    password = getpass("password: ")

    login_url = "https://stackoverflow.com/users/login?ssrc=head&returnurl=https%3a%2f%2fstackoverflow.com%2f"
    data = {
        "email": username,
        "password": password
    }

    session = requests.Session()
    login_response = session.post(login_url, data)

    # On failure the login form is re-rendered, so the final URL and request method (POST) remain
    # the same; a successful login redirects to the front page, which arrives as a GET.
    if login_response.url != "https://stackoverflow.com/" or login_response.request.method != "GET":
        print("The username or password is incorrect")
        exit(1)

    # /users/current redirects to the logged-in user's profile URL, from which the numeric id is extracted.
    user_id = int(session.get("https://stackoverflow.com/users/current").url.split("/")[4])

    log_file_name = next_available_file_name("Stack Overflow posts", "txt")
    with open(log_file_name, "w") as log_file:
        for p in get_posts(filter_post):
            formatted_post = format_post(p)
            print(formatted_post)
            log_file.write(formatted_post)
            log_file.write("\n")
    session.close()

    print(f"Output saved to \"{log_file_name}\"")