Skip to content

Instantly share code, notes, and snippets.

@ostr00000
Last active July 25, 2021 22:53
Show Gist options
  • Save ostr00000/ecb7a07480cda4f2ffad18ee1d955aa4 to your computer and use it in GitHub Desktop.
Save ostr00000/ecb7a07480cda4f2ffad18ee1d955aa4 to your computer and use it in GitHub Desktop.
Load all comments from a Steam Workshop item (useful for finding text in the comments).
import re
import sys
from dataclasses import dataclass
from datetime import datetime
from typing import Optional, Iterator, cast
import bs4
import requests
from bs4 import Tag
# Configuration: workshop page that is scraped when no URL is supplied.
exampleInputUrl = 'https://steamcommunity.com/sharedfiles/filedetails/?id=753498552'
# End of configuration.

# URL template of the comment-render endpoint; ``owner`` and ``feature`` are
# filled in with the ids found in the workshop page's inline script.
_COMMENT_REQUEST_PATTERN = (
    'https://steamcommunity.com/comment/PublishedFile_Public/render/{owner}/{feature}/'
)
def extractFromScriptTag(scriptTag: bs4.Tag) -> Optional[str]:
    """Build the comment-render URL from the ids embedded in a script tag.

    The tag's text is searched for an ``"owner"`` and a ``"feature"`` entry
    whose values are quoted digit strings.

    :param scriptTag: script element to inspect.
    :return: ``_COMMENT_REQUEST_PATTERN`` filled with both ids, or ``None``
        when either id cannot be found in the script text.
    """
    scriptText = str(scriptTag.string)

    ownerMatch = re.search(r'"owner": ?"(\d+)"', scriptText)
    featureMatch = re.search(r'\"feature\": ?\"(\d+)"', scriptText)
    if ownerMatch is None or featureMatch is None:
        return None

    return _COMMENT_REQUEST_PATTERN.format(
        owner=ownerMatch.group(1),
        feature=featureMatch.group(1),
    )
@dataclass
class Comment:
    """A single comment: its author, its text and its timestamp."""

    # Name of the comment's author (spelling kept: it is the public field name).
    autor: str
    # Text of the comment.
    message: str
    # Timestamp attached to the comment.
    timeStamp: datetime

    def __str__(self) -> str:
        """Return ``[<ISO timestamp>] <author padded to 30 columns>: <message>``."""
        stamp = self.timeStamp.isoformat()
        paddedAuthor = f'{self.autor:30}'
        return f'[{stamp}] {paddedAuthor}: {self.message}'
def _parseTimestamp(timestamp: str) -> datetime:
    """Parse a comment timestamp, tolerating an unparseable trailing zone name.

    :param timestamp: text such as ``'25 July, 2021 @ 10:53:12 pm <zone>'``.
    :return: the parsed date and time.
    :raises ValueError: if the text does not match the expected format even
        after its last space-separated token has been dropped.
    """
    try:
        return datetime.strptime(timestamp, '%d %B, %Y @ %I:%M:%S %p %Z')
    except ValueError:
        # ``%Z`` rejected the zone name: drop the last token and parse the
        # rest. ``rpartition`` yields '' when there is no space at all, so bad
        # input ends in ValueError instead of an endless character-stripping
        # loop.
        withoutZone = timestamp.rpartition(' ')[0]
        return datetime.strptime(withoutZone, '%d %B, %Y @ %I:%M:%S %p')


def extractComments(htmlText: str) -> list[Comment]:
    """Extract every comment found in a rendered comment-thread HTML fragment.

    For each element with the classes
    ``commentthread_comment responsive_body_text`` the author is read from
    its ``<bdi>`` element, the message from ``commentthread_comment_text``
    and the timestamp from the ``title`` attribute of
    ``commentthread_comment_timestamp``.

    :param htmlText: HTML markup containing the comments.
    :return: the comments in document order.
    :raises ValueError: if a timestamp cannot be parsed.
    """
    soup = bs4.BeautifulSoup(htmlText, "html.parser")
    result = []
    for singleComment in soup.find_all(class_='commentthread_comment responsive_body_text'):
        autor: str = singleComment.find('bdi').text.strip()
        message: str = singleComment.find(
            class_='commentthread_comment_text').text.strip()
        timestamp: str = singleComment.find(
            class_='commentthread_comment_timestamp')['title'].strip()
        result.append(Comment(autor, message, _parseTimestamp(timestamp)))
    return result
def progressRange(start, stop, step):
    """Yield the values of ``range(start, stop, step)``, printing progress.

    A progress line is printed before each value is yielded, and a final
    "complete" line is printed once the range is exhausted.

    :param start: first value of the range.
    :param stop: exclusive end of the range; also the progress denominator.
    :param step: increment between values.
    """
    for position in range(start, stop, step):
        # Pad the counter to the width of ``stop`` so the lines stay aligned.
        width = len(str(stop))
        ratio = position / stop
        print(f"Progress: [{position:{width}}/{stop}={ratio:0.2f}]")
        yield position

    print(f"Progress: [{stop}/{stop}={1.:0.2f}]")
def getAllComments(url=None) -> Iterator[Comment]:
    """Yield the comments of a workshop page, one result page at a time.

    The page at ``url`` is downloaded, the comment-render URL is derived from
    a script inside its ``detailBox`` element, and that endpoint is then
    requested repeatedly with an increasing ``start`` offset until
    ``total_count`` is reached. Progress is printed via ``progressRange``.

    This is a generator: nothing is requested, and no error is raised, until
    iteration begins.

    :param url: workshop page URL; ``exampleInputUrl`` is used when ``None``.
    :return: iterator over the comments in the order the endpoint returns them.
    :raises ValueError: if the page has no ``detailBox`` element, or none of
        its scripts contains the ids needed to build the comment URL.
    """
    if url is None:
        url = exampleInputUrl

    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    commentsSection = cast(Optional[Tag], soup.find(class_='detailBox'))
    # Explicit check rather than ``assert``: asserts are removed under ``-O``.
    if commentsSection is None:
        raise ValueError("Cannot find 'detailBox' class")

    for scriptSection in commentsSection.find_all('script'):
        if commentUrl := extractFromScriptTag(scriptSection):
            break
    else:
        raise ValueError("Cannot find script id or script section")

    # The first page also carries the paging metadata for the whole thread.
    firstPage = requests.get(commentUrl).json()
    step, end = firstPage['pagesize'], firstPage['total_count']
    yield from extractComments(firstPage['comments_html'])

    # Remaining pages: offsets step, 2*step, ... below total_count.
    for start in progressRange(step, end, step):
        pageResponse = requests.get(commentUrl, params={'start': start})
        yield from extractComments(pageResponse.json()['comments_html'])
def main():
    """Print the comments of the page given as the first CLI argument.

    When no argument is given, ``getAllComments`` falls back to its default
    URL. Printing stops after the comment with index 1000 has been printed.
    """
    cliArgs = sys.argv[1:]
    url = cliArgs[0] if cliArgs else None

    for index, comment in enumerate(getAllComments(url)):
        print(comment)
        if index == 1000:
            break


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment