ostr00000/fetch_steam_comments.py

## fetch_steam_comments.py
import re
import sys
from dataclasses import dataclass
from datetime import datetime
from typing import Optional, Iterator, cast

import bs4
import requests
from bs4 import Tag

# config variables:
# Workshop item page that getAllComments() scrapes when it is given no URL.
exampleInputUrl = 'https://steamcommunity.com/sharedfiles/filedetails/?id=753498552'
# end config


# Template of the comment-render endpoint. '{owner}' and '{feature}' are
# filled with the numeric ids that extractFromScriptTag() finds in the page.
_COMMENT_REQUEST_PATTERN = (
    'https://steamcommunity.com/comment/PublishedFile_Public/render/'
    '{owner}/{feature}/'
)


def extractFromScriptTag(scriptTag: bs4.Tag) -> Optional[str]:
    """Build the comment-render URL from the text of a ``<script>`` tag.

    The script text is searched for ``"owner": "<digits>"`` and
    ``"feature": "<digits>"``; both ids are substituted into
    ``_COMMENT_REQUEST_PATTERN``.

    :param scriptTag: script element whose text may contain the two ids.
    :return: the formatted URL, or ``None`` when the tag exposes no text or
        either id is missing.
    """
    scriptText = scriptTag.string
    if scriptText is None:
        # The original stringified a missing body to the text 'None', which
        # could never match; bail out explicitly instead.
        return None
    js = str(scriptText)

    ownerMatch = re.search(r'"owner": ?"(\d+)"', js)
    featureMatch = re.search(r'"feature": ?"(\d+)"', js)
    if ownerMatch is None or featureMatch is None:
        return None

    return _COMMENT_REQUEST_PATTERN.format(
        owner=ownerMatch.group(1), feature=featureMatch.group(1))


@dataclass
class Comment:
    """One comment of a Steam comment thread, as filled in by extractComments()."""

    # Poster name (text of the comment's <bdi> element). The spelling is part
    # of the public field name and is kept as is.
    autor: str
    # Stripped text of the comment body.
    message: str
    # Posting time parsed from the timestamp element's title attribute.
    timeStamp: datetime

    def __str__(self):
        """Render as ``[<ISO timestamp>] <autor padded to 30 columns>: <message>``."""
        stamp = self.timeStamp.isoformat()
        paddedName = format(self.autor, '30')
        return f'[{stamp}] {paddedName}: {self.message}'


def _parseTimestamp(timestamp: str) -> datetime:
    """Parse a timestamp title of the form ``'%d %B, %Y @ %I:%M:%S %p %Z'``.

    When the full format does not match (for example because ``%Z`` does not
    recognise the zone name), the last space-separated token is dropped and
    the remainder is parsed as a naive datetime.

    :raises ValueError: if the text matches neither format.
    """
    try:
        return datetime.strptime(timestamp, '%d %B, %Y @ %I:%M:%S %p %Z')
    except ValueError:
        # rpartition always terminates. The previous character-by-character
        # stripping looped forever on a timestamp containing no space.
        withoutZone, _, _ = timestamp.rpartition(' ')
        return datetime.strptime(withoutZone, '%d %B, %Y @ %I:%M:%S %p')


def extractComments(htmlText: str) -> list[Comment]:
    """Parse one rendered comment page into ``Comment`` objects.

    :param htmlText: HTML fragment containing the comment elements.
    :return: the comments in document order; empty when none are found.
    :raises ValueError: if a comment's timestamp cannot be parsed.
    """
    soup = bs4.BeautifulSoup(htmlText, "html.parser")
    result = []
    for singleComment in soup.find_all(class_='commentthread_comment responsive_body_text'):
        autor: str = singleComment.find('bdi').text.strip()
        message: str = singleComment.find(
            class_='commentthread_comment_text').text.strip()
        timestamp: str = singleComment.find(
            class_='commentthread_comment_timestamp')['title'].strip()

        result.append(Comment(autor, message, _parseTimestamp(timestamp)))
    return result


def progressRange(start, stop, step):
    """Yield the values of ``range(start, stop, step)`` while printing progress.

    Before each value is yielded, a line ``Progress: [<value>/<stop>=<ratio>]``
    is printed, with the value right-aligned to the width of ``stop``. After
    the range is exhausted a final line reporting ``stop/stop`` is printed.
    """
    width = len(str(stop))
    for current in range(start, stop, step):
        ratio = current / stop
        print(f"Progress: [{current:{width}}/{stop}={ratio:0.2f}]")
        yield current
    # Closing line: the range never reaches `stop`, so report 100% explicitly.
    print(f"Progress: [{stop}/{stop}={1.0:0.2f}]")


def getAllComments(url=None) -> Iterator[Comment]:
    """Lazily yield the comments of a Steam workshop item, page by page.

    The item page is downloaded to discover the comment-render endpoint (via
    ``extractFromScriptTag``); that endpoint is then requested once per page
    and every page is parsed with ``extractComments``.

    :param url: workshop item page; ``exampleInputUrl`` is used when ``None``.
    :return: iterator over the comments in the order the pages deliver them.
    :raises AssertionError: if the page has no element of class ``detailBox``.
    :raises ValueError: if no script in that element yields the endpoint URL.
    """
    if url is None:
        url = exampleInputUrl

    # Seconds to wait for each HTTP call; without a timeout a stalled
    # connection would block the generator forever.
    requestTimeout = 30

    response = requests.get(url, timeout=requestTimeout)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    commentsSection = cast(Optional[Tag], soup.find(class_='detailBox'))
    if commentsSection is None:
        # Raised explicitly so the check is not stripped under `python -O`;
        # the exception type matches the former `assert`.
        raise AssertionError("Cannot find 'detailBox' class")

    for scriptSection in commentsSection.find_all('script'):
        if commentUrl := extractFromScriptTag(scriptSection):
            break
    else:
        raise ValueError("Cannot find script id or script section")

    firstPage = requests.get(commentUrl, timeout=requestTimeout).json()
    pageSize, totalCount = firstPage['pagesize'], firstPage['total_count']

    yield from extractComments(firstPage['comments_html'])
    # The first page is already consumed, so paging resumes at offset
    # `pageSize` (presumably the first request starts at offset 0).
    for offset in progressRange(pageSize, totalCount, pageSize):
        pageResponse = requests.get(
            commentUrl, params={'start': offset}, timeout=requestTimeout)
        yield from extractComments(pageResponse.json()['comments_html'])


def main():
    """Print the comments of the page given as first CLI argument.

    Without an argument, ``getAllComments`` falls back to its default URL.
    Printing stops after the comment with index 1000, so at most 1001
    comments are shown.
    """
    cliArgs = sys.argv[1:]
    url = cliArgs[0] if cliArgs else None

    for index, comment in enumerate(getAllComments(url)):
        print(str(comment))
        if index >= 1000:
            break


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
	import re
	import sys
	from dataclasses import dataclass
	from datetime import datetime
	from typing import Optional, Iterator, cast

	import bs4
	import requests
	from bs4 import Tag

	# config variables:
	# Workshop item page that getAllComments() scrapes when it is given no URL.
	exampleInputUrl = 'https://steamcommunity.com/sharedfiles/filedetails/?id=753498552'
	# end config


	# Template of the comment-render endpoint. '{owner}' and '{feature}' are
	# filled with the numeric ids that extractFromScriptTag() finds in the page.
	_COMMENT_REQUEST_PATTERN = (
	    'https://steamcommunity.com/comment/PublishedFile_Public/render/'
	    '{owner}/{feature}/'
	)


	def extractFromScriptTag(scriptTag: bs4.Tag) -> Optional[str]:
	    """Build the comment-render URL from the text of a ``<script>`` tag.

	    The script text is searched for ``"owner": "<digits>"`` and
	    ``"feature": "<digits>"``; both ids are substituted into
	    ``_COMMENT_REQUEST_PATTERN``.

	    :param scriptTag: script element whose text may contain the two ids.
	    :return: the formatted URL, or ``None`` when the tag exposes no text or
	        either id is missing.
	    """
	    scriptText = scriptTag.string
	    if scriptText is None:
	        # The original stringified a missing body to the text 'None', which
	        # could never match; bail out explicitly instead.
	        return None
	    js = str(scriptText)

	    ownerMatch = re.search(r'"owner": ?"(\d+)"', js)
	    featureMatch = re.search(r'"feature": ?"(\d+)"', js)
	    if ownerMatch is None or featureMatch is None:
	        return None

	    return _COMMENT_REQUEST_PATTERN.format(
	        owner=ownerMatch.group(1), feature=featureMatch.group(1))


	@dataclass
	class Comment:
	    """One comment of a Steam comment thread, as filled in by extractComments()."""

	    # Poster name (text of the comment's <bdi> element). The spelling is part
	    # of the public field name and is kept as is.
	    autor: str
	    # Stripped text of the comment body.
	    message: str
	    # Posting time parsed from the timestamp element's title attribute.
	    timeStamp: datetime

	    def __str__(self):
	        """Render as ``[<ISO timestamp>] <autor padded to 30 columns>: <message>``."""
	        return f'[{self.timeStamp.isoformat()}] {self.autor:30}: {self.message}'


	def _parseTimestamp(timestamp: str) -> datetime:
	    """Parse a timestamp title of the form ``'%d %B, %Y @ %I:%M:%S %p %Z'``.

	    When the full format does not match (for example because ``%Z`` does not
	    recognise the zone name), the last space-separated token is dropped and
	    the remainder is parsed as a naive datetime.

	    :raises ValueError: if the text matches neither format.
	    """
	    try:
	        return datetime.strptime(timestamp, '%d %B, %Y @ %I:%M:%S %p %Z')
	    except ValueError:
	        # rpartition always terminates. The previous character-by-character
	        # stripping looped forever on a timestamp containing no space.
	        withoutZone, _, _ = timestamp.rpartition(' ')
	        return datetime.strptime(withoutZone, '%d %B, %Y @ %I:%M:%S %p')


	def extractComments(htmlText: str) -> list[Comment]:
	    """Parse one rendered comment page into ``Comment`` objects.

	    :param htmlText: HTML fragment containing the comment elements.
	    :return: the comments in document order; empty when none are found.
	    :raises ValueError: if a comment's timestamp cannot be parsed.
	    """
	    soup = bs4.BeautifulSoup(htmlText, "html.parser")
	    result = []
	    for singleComment in soup.find_all(class_='commentthread_comment responsive_body_text'):
	        autor: str = singleComment.find('bdi').text.strip()
	        message: str = singleComment.find(
	            class_='commentthread_comment_text').text.strip()
	        timestamp: str = singleComment.find(
	            class_='commentthread_comment_timestamp')['title'].strip()

	        result.append(Comment(autor, message, _parseTimestamp(timestamp)))
	    return result


	def progressRange(start, stop, step):
	    """Yield the values of ``range(start, stop, step)`` while printing progress.

	    Before each value is yielded, a line ``Progress: [<value>/<stop>=<ratio>]``
	    is printed, with the value right-aligned to the width of ``stop``. After
	    the range is exhausted a final line reporting ``stop/stop`` is printed.
	    """
	    width = len(str(stop))
	    for current in range(start, stop, step):
	        print(f"Progress: [{current:{width}}/{stop}={current / stop:0.2f}]")
	        yield current
	    # Closing line: the range never reaches `stop`, so report 100% explicitly.
	    print(f"Progress: [{stop}/{stop}={1.:0.2f}]")


	def getAllComments(url=None) -> Iterator[Comment]:
	    """Lazily yield the comments of a Steam workshop item, page by page.

	    The item page is downloaded to discover the comment-render endpoint (via
	    ``extractFromScriptTag``); that endpoint is then requested once per page
	    and every page is parsed with ``extractComments``.

	    :param url: workshop item page; ``exampleInputUrl`` is used when ``None``.
	    :return: iterator over the comments in the order the pages deliver them.
	    :raises AssertionError: if the page has no element of class ``detailBox``.
	    :raises ValueError: if no script in that element yields the endpoint URL.
	    """
	    if url is None:
	        url = exampleInputUrl

	    # Seconds to wait for each HTTP call; without a timeout a stalled
	    # connection would block the generator forever.
	    requestTimeout = 30

	    response = requests.get(url, timeout=requestTimeout)
	    soup = bs4.BeautifulSoup(response.text, 'html.parser')
	    commentsSection = cast(Optional[Tag], soup.find(class_='detailBox'))
	    if commentsSection is None:
	        # Raised explicitly so the check is not stripped under `python -O`;
	        # the exception type matches the former `assert`.
	        raise AssertionError("Cannot find 'detailBox' class")

	    for scriptSection in commentsSection.find_all('script'):
	        if commentUrl := extractFromScriptTag(scriptSection):
	            break
	    else:
	        raise ValueError("Cannot find script id or script section")

	    firstPage = requests.get(commentUrl, timeout=requestTimeout).json()
	    pageSize, totalCount = firstPage['pagesize'], firstPage['total_count']

	    yield from extractComments(firstPage['comments_html'])
	    # The first page is already consumed, so paging resumes at offset
	    # `pageSize` (presumably the first request starts at offset 0).
	    for offset in progressRange(pageSize, totalCount, pageSize):
	        pageResponse = requests.get(
	            commentUrl, params={'start': offset}, timeout=requestTimeout)
	        yield from extractComments(pageResponse.json()['comments_html'])


	def main():
	    """Print the comments of the page given as first CLI argument.

	    Without an argument, ``getAllComments`` falls back to its default URL.
	    Printing stops after the comment with index 1000, so at most 1001
	    comments are shown.
	    """
	    url = sys.argv[1] if len(sys.argv) > 1 else None

	    # The enumerate index counts comments, not pages.
	    for commentIndex, com in enumerate(getAllComments(url)):
	        print(str(com))
	        if commentIndex == 1000:
	            break


	# Run the scraper only when this file is executed as a script, not on import.
	if __name__ == '__main__':
	    main()