Last active
July 25, 2021 22:53
-
-
Save ostr00000/ecb7a07480cda4f2ffad18ee1d955aa4 to your computer and use it in GitHub Desktop.
Load all comments from a Steam Workshop item (useful for searching the comment text).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import sys | |
from dataclasses import dataclass | |
from datetime import datetime | |
from typing import Optional, Iterator, cast | |
import bs4 | |
import requests | |
from bs4 import Tag | |
# --- user configuration ---
# Workshop page that is scraped when the caller does not pass a URL.
exampleInputUrl = 'https://steamcommunity.com/sharedfiles/filedetails/?id=753498552'
# --- end of user configuration ---

# Template of the comment-render endpoint; ``owner`` and ``feature`` are the
# numeric ids extracted from a script tag of the workshop page.
_COMMENT_REQUEST_PATTERN = (
    'https://steamcommunity.com/comment/PublishedFile_Public/render/{owner}/{feature}/'
)
def extractFromScriptTag(scriptTag: bs4.Tag) -> Optional[str]:
    """Build the comment-render URL from the contents of a script tag.

    The tag's text is searched for an ``"owner"`` and a ``"feature"`` entry
    whose values are quoted digit strings.

    Args:
        scriptTag: The tag whose ``.string`` is inspected.

    Returns:
        ``_COMMENT_REQUEST_PATTERN`` filled with both ids, or ``None`` when
        either id is not present in the tag's text.
    """
    scriptText = str(scriptTag.string)

    ownerHit = re.search(r'"owner": ?"(\d+)"', scriptText)
    if ownerHit is None:
        return None

    featureHit = re.search(r'\"feature\": ?\"(\d+)"', scriptText)
    if featureHit is None:
        return None

    return _COMMENT_REQUEST_PATTERN.format(
        owner=ownerHit.group(1),
        feature=featureHit.group(1),
    )
@dataclass
class Comment:
    """One comment: its author, its text and the time attached to it."""

    autor: str  # author text of the comment (field name kept for callers)
    message: str  # body text of the comment
    timeStamp: datetime  # time parsed from the comment's timestamp

    def __str__(self):
        """Return ``[ISO time] author: message`` with the author padded to 30 columns."""
        stamp = self.timeStamp.isoformat()
        paddedAutor = format(self.autor, '30')
        return f'[{stamp}] {paddedAutor}: {self.message}'
def extractComments(htmlText: str) -> list[Comment]:
    """Parse all comments contained in a rendered comment-thread HTML fragment.

    Args:
        htmlText: HTML markup that contains the comment elements.

    Returns:
        One ``Comment`` per matching element, in document order.

    Raises:
        ValueError: If a comment's timestamp matches neither the full format
            nor the format without the trailing zone token.
    """
    fullFormat = '%d %B, %Y @ %I:%M:%S %p %Z'
    zonelessFormat = '%d %B, %Y @ %I:%M:%S %p'

    result = []
    bs = bs4.BeautifulSoup(htmlText, "html.parser")
    for singleComment in bs.find_all(class_='commentthread_comment responsive_body_text'):
        autor: str = singleComment.find('bdi').text.strip()
        message: str = singleComment.find(
            class_='commentthread_comment_text').text.strip()
        timestamp: str = singleComment.find(
            class_='commentthread_comment_timestamp')['title'].strip()

        try:
            date = datetime.strptime(timestamp, fullFormat)
        except ValueError:
            # strptime accepts only a handful of zone names for %Z, so drop
            # the last space-separated token (the zone) and parse the rest.
            # rpartition yields '' when there is no space at all, which makes
            # strptime fail fast instead of stripping characters forever.
            withoutZone = timestamp.rpartition(' ')[0]
            try:
                date = datetime.strptime(withoutZone, zonelessFormat)
            except ValueError:
                raise ValueError(
                    f"Unrecognised comment timestamp: {timestamp!r}") from None

        result.append(Comment(autor, message, date))
    return result
def progressRange(start, stop, step):
    """Yield the values of ``range(start, stop, step)`` while printing progress.

    A progress line showing the current value, ``stop`` and their ratio is
    printed before each value is yielded; one final line reporting
    ``stop/stop`` is printed once the range is exhausted.
    """
    width = len(str(stop))  # pad the counter to the width of ``stop``
    for value in range(start, stop, step):
        ratio = value / stop
        print(f"Progress: [{value:{width}}/{stop}={ratio:0.2f}]")
        yield value
    print(f"Progress: [{stop}/{stop}={1.:0.2f}]")
def getAllComments(url=None) -> Iterator[Comment]:
    """Lazily yield every comment of a workshop page.

    The page at ``url`` is downloaded and the script tags inside its
    ``detailBox`` element are searched for the ids needed to build the
    comment-render endpoint URL. That endpoint is then requested page by
    page, using the ``pagesize`` and ``total_count`` values of the first
    response to step through the remaining pages.

    Args:
        url: Workshop page URL; ``exampleInputUrl`` is used when ``None``.

    Yields:
        Comments in the order the endpoint returns them.

    Raises:
        ValueError: If the page has no ``detailBox`` element, or none of its
            script tags yields an endpoint URL.
        requests.RequestException: On timeouts, connection failures or HTTP
            error statuses.
    """
    if url is None:
        url = exampleInputUrl

    # Seconds to wait per request; without a timeout a stalled server would
    # block the generator indefinitely.
    timeout = 30

    # A single session lets every page request reuse the same connection.
    with requests.Session() as session:

        def fetch(target, params=None):
            """GET ``target`` and fail loudly on HTTP error statuses."""
            response = session.get(target, params=params, timeout=timeout)
            response.raise_for_status()
            return response

        soup = bs4.BeautifulSoup(fetch(url).text, 'html.parser')
        commentsSection = cast(Tag, soup.find(class_='detailBox'))
        # Explicit raise instead of ``assert`` so the check survives ``-O``.
        if commentsSection is None:
            raise ValueError("Cannot find 'detailBox' class")

        for scriptSection in commentsSection.find_all('script'):
            if commentUrl := extractFromScriptTag(scriptSection):
                break
        else:
            raise ValueError("Cannot find script id or script section")

        firstPage = fetch(commentUrl).json()
        step, end = firstPage['pagesize'], firstPage['total_count']
        yield from extractComments(firstPage['comments_html'])

        # The first page (offset 0) is already consumed; continue at ``step``.
        for start in progressRange(step, end, step):
            page = fetch(commentUrl, {'start': start}).json()
            yield from extractComments(page['comments_html'])
def main():
    """Print comments for the URL given as first CLI argument.

    When no argument is given, ``None`` is passed on so that
    ``getAllComments`` falls back to its default URL. Printing stops once
    the comment at index 1000 has been written (1001 comments at most).
    """
    cliArgs = sys.argv[1:]
    url = cliArgs[0] if cliArgs else None

    for index, comment in enumerate(getAllComments(url)):
        print(comment)
        if index == 1000:
            break


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment