Created: October 21, 2023 21:19
-
-
Save ws909/60a25460963bc0d3c1ff79596cf7bcaf to your computer and use it in GitHub Desktop.
Community-disputed posts on Stack Overflow
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Finds posts where your voting disagreed with the other site members.

Outputs a list of questions that you have downvoted but which have an overall positive score,
as well as questions you have upvoted which have either been closed or have an overall negative score.

This information is not available with StackAPI, unless every single post on the site is iterated through. Is that faster?
If the filtering can be done server-side, or the total amount of data downloaded is smaller, it definitely should be.
One thing is for sure, though: this script is incredibly slow.
"""
import os.path
from enum import Enum
from getpass import getpass
from typing import Iterator, Optional, Union

import cchardet  # Faster decoding for BS
import lxml  # Faster parser for BS
import requests
from bs4 import BeautifulSoup, SoupStrainer
# Numeric Stack Overflow user id of the logged-in account; set in __main__ after login.
user_id: int = 0
# Authenticated HTTP session shared by all scraping helpers; created in __main__.
# (Annotated Optional: the original `requests.Session = None` claimed a value it never holds at import time.)
session: Optional[requests.Session] = None
class PostType(Enum):
    """Kind of Stack Overflow post.

    The value is the letter used in post URLs: lowercased it becomes the
    path segment, e.g. https://stackoverflow.com/q/<id> or /a/<id>
    (see scrape_post).
    """
    QUESTION = "Q"
    ANSWER = "A"
class Vote(Enum):
    """Direction of the user's own vote.

    The value is the vote's score contribution, so `vote_score * vote.value < 0`
    (see filter_post) detects disagreement with the community score.
    """
    UPVOTE = +1
    DOWNVOTE = -1
class Post:
    """A scraped Stack Overflow post together with the user's own vote on it.

    Fix: the original annotations `int or str` and `Vote or None` are plain
    boolean expressions that evaluate to just `int` / `Vote` — they are not
    union types. Replaced with proper `typing` unions.
    """

    def __init__(self, post_id: Union[int, str], url: str, post_type: PostType,
                 is_closed: bool, vote_score: int, interaction_date):
        self.id: Union[int, str] = post_id        # numeric for answers, raw path string for questions
        self.url: str = url
        self.type: PostType = post_type
        self.is_closed: bool = is_closed          # only ever True for questions (see scrape_post)
        self.vote_score: int = vote_score         # community score at scrape time
        self.self_vote: Optional[Vote] = None     # filled in later by scrape_from_tabs
        self.interaction_date = interaction_date  # currently always None — TODO in scraper

    def __str__(self):
        return f"{self.type.value} {self.id}: {self.url}"
def format_post(post: Post):
    """Render one post as a log line: type, id, scores, optional closed flag, URL."""
    closed_flag = " closed;" if post.is_closed else ""
    return (f"{post.type.value} {post.id}; votes: {post.vote_score}"
            f" | {post.self_vote.value};{closed_flag} {post.url}")
def document_of(url, **kwargs):
    """Fetch `url` through the shared session and parse it with the lxml backend.

    Extra keyword arguments (e.g. `parse_only`) are forwarded to BeautifulSoup.
    """
    response = session.get(url)
    return BeautifulSoup(response.text, features="lxml", **kwargs)
def filter_post(post: Post):
    """Keep posts where the user's vote disagrees with the community outcome.

    True when an upvoted question was closed, or when the post's community
    score has the opposite sign to the user's own vote.
    """
    if post.is_closed and post.self_vote == Vote.UPVOTE:
        return True
    disagreement = post.vote_score * post.self_vote.value
    return disagreement < 0
def scrape_post(post_id, post_type: PostType, interaction_date):
    """Fetch one question/answer page and extract its score and closed state.

    Returns a Post. Raises KeyError(url) when the expected post element is
    missing (e.g. the post was deleted, or the page layout changed).
    """
    # /q/<id> and /a/<id> are the short post URLs (PostType values lowercased).
    url = f"https://stackoverflow.com/{post_type.value.lower()}/{post_id}"
    # Parse only the main content column — skips header/sidebar for speed.
    strainer = SoupStrainer("div", attrs={"id": "mainbar"})
    document = document_of(url, parse_only=strainer)
    if post_type is PostType.QUESTION:
        post = document.find(id="question")
    else:
        post = document.find(id=f"answer-{post_id}")
    if post is None:
        raise KeyError(url)
    is_closed = False
    if post_type is PostType.QUESTION:
        try:
            # NOTE(review): matches a "Closed" notice by its exact CSS class list
            # and string content — brittle against Stack Overflow markup changes;
            # verify it still detects closed questions.
            is_closed = (post.find(class_="s-prose js-post-body")
                         .find("aside", class_="s-notice s-notice__info post-notice js-post-notice mb16", string="Closed")
                         is not None)
        except AttributeError:
            # No post body / notice chain found: treat the question as not closed.
            pass
    vote_score = int(post.find(class_="js-vote-count").text)
    return Post(post_id, url, post_type, is_closed, vote_score, interaction_date)
def get_posts(filter_function):
    """Yield every scraped post accepted by `filter_function`."""
    # TODO: maybe count how many are skipped?
    yield from (post for post in scrape_from_tabs() if filter_function(post))
def scrape_from_tabs():
    """Yield all posts the user voted on, tagging each with its vote direction.

    Upvoted posts are produced first, then downvoted ones, matching the two
    profile tabs they are scraped from.
    """
    for tab_name, vote in (("upvote", Vote.UPVOTE), ("downvote", Vote.DOWNVOTE)):
        for post in scrape_from_profile_tab(tab_name):
            post.self_vote = vote
            yield post
def scrape_from_profile_tab(tab: str) -> Iterator[Post]:
    """Yield every post on the given votes tab ("upvote"/"downvote"),
    walking pages until the scraper signals the end of pagination.

    Fix: the end-of-pages signal is a StopIteration raised inside the
    scraping generator. On Python 3.7+ (PEP 479) that surfaces here as a
    RuntimeError with the StopIteration as its __cause__, so the original
    `except StopIteration` never fired and the script crashed instead of
    terminating. Both forms are now handled.
    """
    page_number = 0
    try:
        while True:
            page_number += 1
            yield from scrape_vote_links(tab, user_id, page_number)
    except StopIteration:
        return  # raised directly (non-generator scraper)
    except RuntimeError as error:
        # PEP 479: StopIteration escaping a generator is re-raised as
        # RuntimeError *from* the StopIteration; anything else is a real error.
        if not isinstance(error.__cause__, StopIteration):
            raise
        return
def scrape_vote_links(vote_type: str, user_id: int, page_number: int):
    """Scrape one page of the user's votes tab, sorted by the given vote type."""
    page_url = (f"https://stackoverflow.com/users/{user_id}"
                f"?tab=votes&sort={vote_type}&page={page_number}")
    return scrape_vote_page_links(page_url)
def scrape_vote_page_links(url: str) -> Iterator[Post]:
    """Scrape one votes-tab page and return an iterator of its posts.

    Raises StopIteration when the page has no votes list, i.e. pagination
    ran past the last page.

    Fix: this used to be a generator that did `raise StopIteration`, which
    PEP 479 (Python 3.7+) converts to RuntimeError, breaking the caller's
    `except StopIteration`. The page is now fetched and checked eagerly in
    this plain function — so StopIteration propagates as itself — and only
    the per-link work is deferred to an inner generator.
    """
    strainer = SoupStrainer("div", attrs={"id": "user-tab-votes"})
    document = document_of(url, parse_only=strainer)
    try:
        posts_links = document.find(id="user-tab-votes").find(class_="js-expandable-posts").find_all("a")
    except AttributeError:
        # Reading past the last page of votes.
        raise StopIteration(url)

    def generate() -> Iterator[Post]:
        for entry in posts_links:
            components = entry["href"].split("/")
            # TODO: also scrape the date! It's vital to know _when_ the vote was cast!
            # Will probably have to change `.find_all("a")` to something else, for it.
            interaction_date = None
            # Answer links end in a numeric "<answer-id>" (possibly with a "#..."
            # fragment); question links have the id as the third path component.
            # The int() conversion is done up front so a ValueError from inside
            # scrape_post can no longer be mistaken for a question link.
            tail = components[-1].split("#")[0]
            try:
                answer_id = int(tail)
            except ValueError:
                yield scrape_post(components[2], PostType.QUESTION, interaction_date)
            else:
                yield scrape_post(answer_id, PostType.ANSWER, interaction_date)

    return generate()
def next_available_file_name(name: str, extension: str):
    """Return "name.extension", or "name 2.extension", "name 3.extension", ...
    — the first such path that does not already exist."""
    candidate = f"{name}.{extension}"
    suffix = 2
    while os.path.exists(candidate):
        candidate = f"{name} {suffix}.{extension}"
        suffix += 1
    return candidate
if __name__ == '__main__':
    # Alternative to username + password:
    # https://api.stackexchange.com/docs/authentication (It requires registration of this script on StackApps)
    # Asking for credentials is fine for a small script like this.
    # getpass hides both inputs; the e-mail is prompted the same way as the password.
    username = getpass("e-mail: ")
    password = getpass("password: ")
    login_url = "https://stackoverflow.com/users/login?ssrc=head&returnurl=https%3a%2f%2fstackoverflow.com%2f"
    data = {
        "email": username,
        "password": password
    }
    session = requests.Session()
    login_response = session.post(login_url, data)
    # In case of failure, the URL and request method (POST) remain the same.
    # A successful login redirects (GET) to the front page.
    if login_response.url != "https://stackoverflow.com/" or login_response.request.method != "GET":
        print("The username or password is incorrect")
        exit(1)
    # /users/current redirects to the profile URL; splitting on "/" puts the
    # numeric user id at index 4. NOTE(review): this uses POST — presumably a
    # GET would work as well; confirm before changing.
    user_id = int(session.post("https://stackoverflow.com/users/current").url.split("/")[4])
    log_file_name = next_available_file_name("Stack Overflow posts", "txt")
    with open(log_file_name, "w") as log_file:
        # Stream results: print each disputed post as found, and mirror it to the log.
        for p in get_posts(filter_post):
            formatted_post = format_post(p)
            print(formatted_post)
            log_file.write(formatted_post)
            log_file.write("\n")
    session.close()
    print(f"Output saved to \"{log_file_name}\"")
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.