Created: October 21, 2023 21:19
-
-
Save ws909/60a25460963bc0d3c1ff79596cf7bcaf to your computer and use it in GitHub Desktop.
Community-disputed posts on Stack Overflow
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Finds posts where your voting disagreed with the other site members.

Outputs a list of questions that you have downvoted but which have an overall positive score,
as well as questions you have upvoted which have either been closed or have an overall negative score.

This information is not available with StackAPI, unless every single post on the site is iterated through. Is that faster?
If the filtering can be done server-side, or the total amount of data downloaded is smaller, it definitely should be.
One thing is for sure, though: this script is incredibly slow.
"""
import os.path
from enum import Enum
from getpass import getpass
from typing import Iterator, Optional, Union

import cchardet  # Faster decoding for BS
import lxml  # Faster parser for BS
import requests
from bs4 import BeautifulSoup, SoupStrainer
# Numeric Stack Overflow user id of the logged-in account; set in __main__ after login.
user_id: int = 0
# Authenticated HTTP session shared by all scraping helpers; created in __main__.
# (Annotated Optional: the original `requests.Session = None` claimed a value it never holds at import time.)
session: Optional[requests.Session] = None
class PostType(Enum):
    """Kind of Stack Overflow post.

    The value is the letter used in post URLs: lowercased it becomes the
    path segment, e.g. https://stackoverflow.com/q/<id> or /a/<id>
    (see scrape_post).
    """
    QUESTION = "Q"
    ANSWER = "A"
class Vote(Enum):
    """Direction of the user's own vote.

    The value is the vote's score contribution, so `vote_score * vote.value < 0`
    (see filter_post) detects disagreement with the community score.
    """
    UPVOTE = +1
    DOWNVOTE = -1
class Post:
    """A scraped Stack Overflow post together with the user's own vote on it.

    Fix: the original annotations `int or str` and `Vote or None` are plain
    boolean expressions that evaluate to just `int` / `Vote` — they are not
    union types. Replaced with proper `typing` unions.
    """

    def __init__(self, post_id: Union[int, str], url: str, post_type: PostType,
                 is_closed: bool, vote_score: int, interaction_date):
        self.id: Union[int, str] = post_id        # numeric for answers, raw path string for questions
        self.url: str = url
        self.type: PostType = post_type
        self.is_closed: bool = is_closed          # only ever True for questions (see scrape_post)
        self.vote_score: int = vote_score         # community score at scrape time
        self.self_vote: Optional[Vote] = None     # filled in later by scrape_from_tabs
        self.interaction_date = interaction_date  # currently always None — TODO in scraper

    def __str__(self):
        return f"{self.type.value} {self.id}: {self.url}"
def format_post(post: Post):
    """Render one post as a log line: type, id, scores, optional closed flag, URL."""
    closed_flag = " closed;" if post.is_closed else ""
    return (f"{post.type.value} {post.id}; votes: {post.vote_score}"
            f" | {post.self_vote.value};{closed_flag} {post.url}")
def document_of(url, **kwargs):
    """Fetch `url` through the shared session and parse it with the lxml backend.

    Extra keyword arguments (e.g. `parse_only`) are forwarded to BeautifulSoup.
    """
    response = session.get(url)
    return BeautifulSoup(response.text, features="lxml", **kwargs)
def filter_post(post: Post):
    """Keep posts where the user's vote disagrees with the community outcome.

    True when an upvoted question was closed, or when the post's community
    score has the opposite sign to the user's own vote.
    """
    if post.is_closed and post.self_vote == Vote.UPVOTE:
        return True
    disagreement = post.vote_score * post.self_vote.value
    return disagreement < 0
def scrape_post(post_id, post_type: PostType, interaction_date):
    """Fetch one question/answer page and extract its score and closed state.

    Returns a Post. Raises KeyError(url) when the expected post element is
    missing (e.g. the post was deleted, or the page layout changed).
    """
    # /q/<id> and /a/<id> are the short post URLs (PostType values lowercased).
    url = f"https://stackoverflow.com/{post_type.value.lower()}/{post_id}"
    # Parse only the main content column — skips header/sidebar for speed.
    strainer = SoupStrainer("div", attrs={"id": "mainbar"})
    document = document_of(url, parse_only=strainer)
    if post_type is PostType.QUESTION:
        post = document.find(id="question")
    else:
        post = document.find(id=f"answer-{post_id}")
    if post is None:
        raise KeyError(url)
    is_closed = False
    if post_type is PostType.QUESTION:
        try:
            # NOTE(review): matches a "Closed" notice by its exact CSS class list
            # and string content — brittle against Stack Overflow markup changes;
            # verify it still detects closed questions.
            is_closed = (post.find(class_="s-prose js-post-body")
                         .find("aside", class_="s-notice s-notice__info post-notice js-post-notice mb16", string="Closed")
                         is not None)
        except AttributeError:
            # No post body / notice chain found: treat the question as not closed.
            pass
    vote_score = int(post.find(class_="js-vote-count").text)
    return Post(post_id, url, post_type, is_closed, vote_score, interaction_date)
def get_posts(filter_function):
    """Yield every scraped post accepted by `filter_function`."""
    # TODO: maybe count how many are skipped?
    yield from (post for post in scrape_from_tabs() if filter_function(post))
def scrape_from_tabs():
    """Yield all posts the user voted on, tagging each with its vote direction.

    Upvoted posts are produced first, then downvoted ones, matching the two
    profile tabs they are scraped from.
    """
    for tab_name, vote in (("upvote", Vote.UPVOTE), ("downvote", Vote.DOWNVOTE)):
        for post in scrape_from_profile_tab(tab_name):
            post.self_vote = vote
            yield post
def scrape_from_profile_tab(tab: str) -> Iterator[Post]:
    """Yield every post on the given votes tab ("upvote"/"downvote"),
    walking pages until the scraper signals the end of pagination.

    Fix: the end-of-pages signal is a StopIteration raised inside the
    scraping generator. On Python 3.7+ (PEP 479) that surfaces here as a
    RuntimeError with the StopIteration as its __cause__, so the original
    `except StopIteration` never fired and the script crashed instead of
    terminating. Both forms are now handled.
    """
    page_number = 0
    try:
        while True:
            page_number += 1
            yield from scrape_vote_links(tab, user_id, page_number)
    except StopIteration:
        return  # raised directly (non-generator scraper)
    except RuntimeError as error:
        # PEP 479: StopIteration escaping a generator is re-raised as
        # RuntimeError *from* the StopIteration; anything else is a real error.
        if not isinstance(error.__cause__, StopIteration):
            raise
        return
def scrape_vote_links(vote_type: str, user_id: int, page_number: int):
    """Scrape one page of the user's votes tab, sorted by the given vote type."""
    page_url = (f"https://stackoverflow.com/users/{user_id}"
                f"?tab=votes&sort={vote_type}&page={page_number}")
    return scrape_vote_page_links(page_url)
def scrape_vote_page_links(url: str) -> Iterator[Post]:
    """Scrape one votes-tab page and return an iterator of its posts.

    Raises StopIteration when the page has no votes list, i.e. pagination
    ran past the last page.

    Fix: this used to be a generator that did `raise StopIteration`, which
    PEP 479 (Python 3.7+) converts to RuntimeError, breaking the caller's
    `except StopIteration`. The page is now fetched and checked eagerly in
    this plain function — so StopIteration propagates as itself — and only
    the per-link work is deferred to an inner generator.
    """
    strainer = SoupStrainer("div", attrs={"id": "user-tab-votes"})
    document = document_of(url, parse_only=strainer)
    try:
        posts_links = document.find(id="user-tab-votes").find(class_="js-expandable-posts").find_all("a")
    except AttributeError:
        # Reading past the last page of votes.
        raise StopIteration(url)

    def generate() -> Iterator[Post]:
        for entry in posts_links:
            components = entry["href"].split("/")
            # TODO: also scrape the date! It's vital to know _when_ the vote was cast!
            # Will probably have to change `.find_all("a")` to something else, for it.
            interaction_date = None
            # Answer links end in a numeric "<answer-id>" (possibly with a "#..."
            # fragment); question links have the id as the third path component.
            # The int() conversion is done up front so a ValueError from inside
            # scrape_post can no longer be mistaken for a question link.
            tail = components[-1].split("#")[0]
            try:
                answer_id = int(tail)
            except ValueError:
                yield scrape_post(components[2], PostType.QUESTION, interaction_date)
            else:
                yield scrape_post(answer_id, PostType.ANSWER, interaction_date)

    return generate()
def next_available_file_name(name: str, extension: str):
    """Return "name.extension", or "name 2.extension", "name 3.extension", ...
    — the first such path that does not already exist."""
    candidate = f"{name}.{extension}"
    suffix = 2
    while os.path.exists(candidate):
        candidate = f"{name} {suffix}.{extension}"
        suffix += 1
    return candidate
if __name__ == '__main__':
    # Alternative to username + password:
    # https://api.stackexchange.com/docs/authentication (It requires registration of this script on StackApps)
    # Asking for credentials is fine for a small script like this.
    # getpass hides both inputs; the e-mail is prompted the same way as the password.
    username = getpass("e-mail: ")
    password = getpass("password: ")
    login_url = "https://stackoverflow.com/users/login?ssrc=head&returnurl=https%3a%2f%2fstackoverflow.com%2f"
    data = {
        "email": username,
        "password": password
    }
    session = requests.Session()
    login_response = session.post(login_url, data)
    # In case of failure, the URL and request method (POST) remain the same.
    # A successful login redirects (GET) to the front page.
    if login_response.url != "https://stackoverflow.com/" or login_response.request.method != "GET":
        print("The username or password is incorrect")
        exit(1)
    # /users/current redirects to the profile URL; splitting on "/" puts the
    # numeric user id at index 4. NOTE(review): this uses POST — presumably a
    # GET would work as well; confirm before changing.
    user_id = int(session.post("https://stackoverflow.com/users/current").url.split("/")[4])
    log_file_name = next_available_file_name("Stack Overflow posts", "txt")
    with open(log_file_name, "w") as log_file:
        # Stream results: print each disputed post as found, and mirror it to the log.
        for p in get_posts(filter_post):
            formatted_post = format_post(p)
            print(formatted_post)
            log_file.write(formatted_post)
            log_file.write("\n")
    session.close()
    print(f"Output saved to \"{log_file_name}\"")
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.