@dtlnor
Last active August 30, 2023 08:27
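# Archive helper for zan-live.com live/VOD pages: downloads page images, gift
# icons and metadata, ticket info, and the admin ("special") comment logs.
# Requires Python 3.9+ (uses str.removesuffix) plus beautifulsoup4, requests
# and python-dateutil. Run it and paste the live URL, your session cookie and,
# optionally, an X-Csrf-Token when prompted.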
from bs4 import BeautifulSoup
import requests
import json
import datetime
import dateutil.parser
import html
from pathlib import Path
import os
import unicodedata
# GLOBAL
archivePath: str = os.path.join(os.getcwd(), "Archive")  # root folder for everything we save
headers = ""
liveurl = ""      # zan-live page URL, set in __main__
cookie = ""       # session cookie value, set in __main__
XCsrfToken = ""   # optional CSRF token for the profile API, set in __main__
def translate_valid_filename(name):
    # map characters that are invalid in Windows filenames to their
    # full-width equivalents
    half_to_full = {
        '<': '＜',
        '>': '＞',
        ':': '：',
        '/': '／',
        '|': '｜',
        '?': '？',
        '\\': '＼',
        '"': '＂',
        '\t': ' '
    }
    s = unicodedata.normalize('NFKC', name)
    s = str(s).strip()
    for k, v in half_to_full.items():
        s = s.replace(k, v)
    # turn to full-width
    # return s.translate(half_to_full)
    return s
def saveFileFromLink(link: str, liveFolderName: str, subFolderName: str = "", localFilename: str = ""):
    # normalise relative links to absolute zan-live URLs
    if not link.startswith("http"):
        if link.startswith("/static/img/live/gift"):
            link = "https://www.zan-live.com" + link
        elif link.startswith("/image"):
            link = "https://www.zan-live.com" + link
        else:
            print("invalid link: " + link)
            return ""
    if "?" in link:
        print(f"save \"{link}\" -> \"{link.split('?')[0]}\"")
        link = link.split('?')[0]
    # stock gift icons and default images; skip re-downloading these
    default = ["5_l0bykluf", "25_l56d6km7", "25_l56d6qpm", "5_l06cmoz4", "25_l2o34nth", "25_l4dxddjt",
               "icon_50000b.svg", "icon_30000b.svg", "icon_10000b.svg",
               "icon_5000b.svg", "icon_3000b.svg", "icon_1500b.svg",
               "icon_1000b.svg", "icon_500b.svg", "icon_100b.svg",
               "icon_50000.svg", "icon_30000.svg", "icon_10000.svg",
               "icon_5000.svg", "icon_3000.svg", "icon_1500.svg",
               "icon_1000.svg", "icon_500.svg", "icon_100.svg", "findme-s_eb4c4e77-a080-44e8-a55b-7b2442e8ed5a"]
    if any(sample in link for sample in default):
        print("default picture skipped: " + link)
        return ""
    if subFolderName != "":
        fullFolderPath = os.path.join(
            archivePath, liveFolderName, subFolderName)
    else:
        fullFolderPath = os.path.join(archivePath, liveFolderName)
    Path(fullFolderPath).mkdir(parents=True, exist_ok=True)
    if localFilename == "":
        localFilename = link.split("/")[-1]
    with open(os.path.join(fullFolderPath, localFilename), 'wb') as f:
        downloaded = requests.get(link)
        f.write(downloaded.content)
    return os.path.join(fullFolderPath, localFilename)
def translateTs(value) -> datetime.datetime:
    # zan-live timestamps are given in milliseconds
    return datetime.datetime.fromtimestamp(float(value) / 1000.0)
def isoToDatetime(value: str) -> datetime.datetime:
    return dateutil.parser.isoparse(value)
def giftJsonUnescape(value: str):
    giftJson = json.loads(value)
    for gift in giftJson:
        gift["name"] = html.unescape(gift["name"])\
            .replace(u"\u00A0", " ") if gift["name"] else gift["name"]
    return giftJson
def downloadJsonFromIndex(filepath: str, liveFolderName: str, subFolderName: str):
    commentList = list()
    # commentDict = {"main": None, "others": None}
    with open(filepath, encoding="UTF-8", mode="r") as f:
        d = json.load(f)
    for path in d["comments"].keys():
        commentList.append(saveFileFromLink(
            link=str(path), liveFolderName=liveFolderName, subFolderName=subFolderName))
    # commentDict["main"] = commentMainList
    if "others" in d:
        commentList.append(saveFileFromLink(
            link=str(d["others"]), liveFolderName=liveFolderName, subFolderName=subFolderName))
    return commentList
def downloadJsonFromManifest(link: str, liveFolderName: str, subFolderName: str):
    # the manifest lists paged comment JSON files plus separate "special" (admin) pages
    commentList = list()
    manifestFile = saveFileFromLink(link, liveFolderName, subFolderName)
    with open(manifestFile, encoding="UTF-8") as f:
        d = json.load(f)
    count = d["comment"]["lastIndex"]
    countsp = d["special"]["lastIndex"]
    mainfilename = str(d["comment"]["list"][-1]["url"]).split("/")[-1]
    spfilename = str(d["special"]["list"][-1]["url"]).split("/")[-1]
    mainurl = str(d["comment"]["list"][-1]["url"]).removesuffix(mainfilename)
    spurl = str(d["special"]["list"][-1]["url"]).removesuffix(spfilename)
    for i in range(1, count + 1):
        url = mainurl + str(i) + ".json"
        commentList.append(saveFileFromLink(url, liveFolderName, subFolderName))
    for i in range(1, countsp + 1):
        url = spurl + f"specials_{i}.json"
        commentList.append(saveFileFromLink(url, liveFolderName, subFolderName))
    return commentList
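# Fallback mapping of zan-live user IDs to admin display names, used when the
# profile API cannot be queried (no X-Csrf-Token given) or the request fails.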
hardcodeUser = {
"1.3p-.J":"Isekaijoucho",
"1.39w.U":"理芽",
"1.3pr.V":"CIEL_VanillaSky",
"1.1JL.L":"操桃",
"1.19-.I":"理芽",
"1.1hu.V":"花譜",
"1.1Te.2":"PIEDPIPER",
"1._jR.E":"VESPERBELL",
"1.8K.U":"UMAFF",
"1.OYf.j":"GEMSCOMPANY公式アカウント",
"1.wKh.L":"赤羽ユキノ",
"1.dcB.R":"小瀬戸らむ",
"1.dlb.j":"一文字マヤ",
"1._SU.e":"音羽雫",
"1.wKE.Z":"有栖川レイカ",
"1.wKH.R":"長谷みこと",
"1.ObO.E":"奈日抽ねね",
"1.dlS.Z":"星菜日向夏",
"1.Owv.s":"KYOHEINISHINO",
"1.1NX.E":"素直",
"1.xME.z":"存流",
"1.xMZ.m":"明透",
"1.Phv.y":"livecartoon"
}
def filterMainComment(filenameList: list, adminUserId: str = ""):
    # keep only "special" comments: type 5, or those posted with an admin_name
    filteredCommentList = []
    for filepath in filenameList:
        with open(filepath, encoding="UTF-8", mode="r") as f:
            d = json.load(f)
        for comment in d:
            if comment["type"] == 5 or comment["admin_name"] is not None:
                user_name = comment["user_id"]
                if XCsrfToken != "":
                    # resolve the user id to a display name via the profile API
                    response = requests.post(url=R"https://www.zan-live.com/api/user/getProfileList",
                                             headers={"Cookie": cookie, "X-Csrf-Token": XCsrfToken},
                                             data={"userIds[]": user_name})
                    try:
                        user_name = response.json()["result"]["profileList"][0]["userName"]
                        if comment["user_id"] not in hardcodeUser.keys() or user_name not in hardcodeUser.values():
                            print(f"\tnew Admin User: \"{comment['user_id']}\" = \"{user_name}\"")
                    except Exception:
                        print("Failed to get user profile, falling back to the hardcoded user list.")
                user_name = hardcodeUser.get(user_name, user_name)
                timestamp = comment["created_at"]
                # the timestamp in comments is at UTC+0; shift to JST
                timestamp = isoToDatetime(timestamp) + datetime.timedelta(hours=9)
                filteredCommentList.append({
                    "user_name": user_name,
                    "ts": timestamp,
                    "text": comment["content"]["text"]
                })
    return filteredCommentList
def procGifts(soup: BeautifulSoup, metaName: str, liveFolderName: str, subFolderName: str = ""):
    # dump a gift metadata block (normal/special/comboFinish) and download each
    # gift's list icon and background image
    giftElement = soup.find("meta", attrs={"name": metaName})
    if giftElement is None:
        print(f"No {metaName}")
        return
    giftElementJson = giftJsonUnescape(giftElement["content"])
    if subFolderName != "":
        fullFolderPath = os.path.join(
            archivePath, liveFolderName, subFolderName)
    else:
        fullFolderPath = os.path.join(archivePath, liveFolderName)
    Path(fullFolderPath).mkdir(parents=True, exist_ok=True)
    if giftElement["content"] and giftElement["content"] != "[]":
        print(metaName + " found")
        with open(os.path.join(fullFolderPath, metaName + ".json"), 'w', encoding="utf-8") as f:
            json.dump(giftElementJson, ensure_ascii=False, indent=4, fp=f)
        for gift in giftElementJson:
            giftName = gift["name"]
            if len(giftName) < 2:
                giftName = ""
            else:
                giftName = giftName + "." + gift["listIconUrl"].split(".")[-1]
            if gift["backgroundImage"] != "":
                giftName = ""
                saveFileFromLink(
                    link=gift["backgroundImage"],
                    liveFolderName=liveFolderName,
                    subFolderName=subFolderName,
                    localFilename=""
                )
            saveFileFromLink(
                link=gift["listIconUrl"],
                liveFolderName=liveFolderName,
                subFolderName=subFolderName,
                localFilename=giftName
            )
    else:
        print("No " + metaName)
def grabZan(archivePath: str = os.path.join(os.getcwd(), "Archive"), skipdownload=True):
    # fetch the live page with the user's cookie and archive everything found in it
    # print(soup.prettify())  # print the prettified HTML content
    headers = {"Cookie": cookie}
    response = requests.get(
        url=liveurl,
        headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    # basic live metadata from the page's <meta> tags
    title = soup.find("title")
    print(title.text if (title and title.text != "") else "No Title given")
    liveName = soup.find("meta", attrs={"name": "live-name"})
    liveName = liveName["content"] if (liveName and liveName["content"]) else "NoLiveName"
    print(liveName if liveName != "NoLiveName"
          else "No meta live-name given")
    liveName = translate_valid_filename(liveName)
    url = soup.find("meta", attrs={"name": "live-url"})
    print(url["content"] if (url and url["content"])
          else "No meta live-url given")
    openlivedate = soup.find("meta", attrs={"name": "open-live-date"})
    print(isoToDatetime(openlivedate["content"]) if (openlivedate and openlivedate["content"])
          else "No meta open-live-date given")
    vodstartdate = soup.find("meta", attrs={"name": "vod-start-date"})
    print(translateTs(vodstartdate["content"]) if (vodstartdate and vodstartdate["content"])
          else "No meta vod-start-date given")
    selfuserid = soup.find("meta", attrs={"name": "self-user-id"})
    print(selfuserid["content"] if (selfuserid and selfuserid["content"])
          else "No meta self-user-id given")
    # "vod-comment-manifest": index of VOD comment pages; download them all and
    # write the admin comments to 主持人評論.txt (host/admin comments)
    commentmanifest = soup.find("meta", attrs={"name": "vod-comment-manifest"})
    if (commentmanifest and commentmanifest["content"]):
        print(commentmanifest["content"])
        commentIndexFilePath = saveFileFromLink(
            link=commentmanifest["content"], liveFolderName=liveName, subFolderName="raw comments")
        commentList = downloadJsonFromIndex(
            commentIndexFilePath, liveFolderName=liveName, subFolderName="raw comments") if len(commentIndexFilePath) > 0 else ""
        filteredCommentList = filterMainComment(
            filenameList=commentList, adminUserId=selfuserid["content"]) if len(commentList) > 0 else ""
        if len(filteredCommentList) <= 0:
            print("No special comment!!!")
        else:
            constructContent = ""
            for comment in filteredCommentList:
                constructContent = constructContent + \
                    comment["user_name"] + ": (" + comment["ts"].strftime("%H:%M:%S") + ")\n" + comment["text"] + "\n"
            print("Start time(" + translateTs(vodstartdate["content"]).strftime("%H:%M:%S") + ")\n" + constructContent)
            with open(os.path.join(archivePath, liveName, "主持人評論.txt"), mode='w', encoding="utf-8") as f:
                f.write("Start time(" + translateTs(vodstartdate["content"]).strftime("%H:%M:%S") + ")\n" + constructContent)
    else:
        print("No meta vod-comment-manifest given")
    vodcomment = soup.find("meta", attrs={"name": "vod-comment-manifest-url"})
    if (vodcomment and vodcomment["content"]):
        print(vodcomment["content"])
        commentIndexFilePath = saveFileFromLink(
            link=vodcomment["content"], liveFolderName=liveName, subFolderName="raw comments")
        commentList = downloadJsonFromIndex(
            commentIndexFilePath, liveFolderName=liveName, subFolderName="raw comments") if len(commentIndexFilePath) > 0 else ""
        filteredCommentList = filterMainComment(
            filenameList=commentList, adminUserId=selfuserid["content"]) if len(commentList) > 0 else ""
        if len(filteredCommentList) <= 0:
            print("No special comment!!!")
        else:
            constructContent = ""
            for comment in filteredCommentList:
                constructContent = constructContent + \
                    comment["user_name"] + ": (" + comment["ts"].strftime("%H:%M:%S") + ")\n" + comment["text"] + "\n"
            print("Start time(" + translateTs(vodstartdate["content"]).strftime("%H:%M:%S") + ")\n" + constructContent)
            with open(os.path.join(archivePath, liveName, "主持人評論.txt"), mode='w', encoding='utf-8-sig') as f:
                f.write("Start time(" + translateTs(vodstartdate["content"]).strftime("%H:%M:%S") + ")\n" + constructContent)
    else:
        print("No meta vod-comment-manifest-url given")
    # "comment-maniefst-url" is searched as-is; presumably the page uses this
    # spelling. It points at the re-live comment manifest.
    comment = soup.find("meta", attrs={"name": "comment-maniefst-url"})
    if (comment and comment["content"]):
        print(comment["content"])
        refilenameList = downloadJsonFromManifest(comment["content"], liveName, subFolderName="re-live raw comments")
        filteredReCommentList = filterMainComment(
            filenameList=refilenameList, adminUserId=selfuserid["content"])
        if len(filteredReCommentList) <= 0:
            print("No Manifest special comment!!!")
        else:
            constructContent = ""
            for comment in filteredReCommentList:
                constructContent = constructContent + \
                    comment["user_name"] + ": (" + comment["ts"].strftime("%H:%M:%S") + ")\n" + comment["text"] + "\n"
            print("Start time(" + translateTs(vodstartdate["content"]).strftime("%H:%M:%S") + ")\n" + constructContent)  # start ts might change
            with open(os.path.join(archivePath, liveName, "主持人評論Re.txt"), 'w', encoding='utf-8-sig') as f:
                f.write("Start time(" + translateTs(vodstartdate["content"]).strftime("%H:%M:%S") + ")\n" + constructContent)
    else:
        print("No meta comment-maniefst-url given")
    backGroundImageURL = soup.find("meta", attrs={"name": "design-backGroundImageURL"})
    if (backGroundImageURL and backGroundImageURL["content"]):
        print(backGroundImageURL["content"])
        saveFileFromLink(
            link=backGroundImageURL["content"], liveFolderName=liveName, subFolderName="pictures")
    else:
        print("No meta design-backGroundImageURL given")
    titleImageURL = soup.find("meta", attrs={"name": "design-titleImageURL"})
    if (titleImageURL and titleImageURL["content"]):
        print(titleImageURL["content"])
        saveFileFromLink(
            link=titleImageURL["content"], liveFolderName=liveName, subFolderName="pictures")
    else:
        print("No meta design-titleImageURL given")
    taptostartImage = soup.find("meta", attrs={"name": "design-taptostartImage"})
    if (taptostartImage and taptostartImage["content"]):
        print(taptostartImage["content"])
        saveFileFromLink(
            link=taptostartImage["content"], liveFolderName=liveName, subFolderName="pictures")
    else:
        print("No meta design-taptostartImage given")
    verticalBackGroundImageURL = soup.find("meta", attrs={"name": "design-verticalBackGroundImageURL"})
    if (verticalBackGroundImageURL and verticalBackGroundImageURL["content"]):
        print(verticalBackGroundImageURL["content"])
        saveFileFromLink(link=verticalBackGroundImageURL["content"],
                         liveFolderName=liveName, subFolderName="pictures")
    else:
        print("No meta design-verticalBackGroundImageURL given")
    liveBanners = soup.find("meta", attrs={"name": "live-banners"})
    if (liveBanners and liveBanners["content"] and liveBanners["content"] != "[]"):
        liveBannerurl = json.loads(liveBanners["content"])[0]["bannerImageURL"]
        if (liveBannerurl):
            print(liveBannerurl)
            saveFileFromLink(
                link=liveBannerurl, liveFolderName=liveName, subFolderName="pictures")
    else:
        print("No meta live-banners given")
    procGifts(soup, metaName="normalGifts",
              liveFolderName=liveName, subFolderName="pictures")
    procGifts(soup, metaName="specialGifts",
              liveFolderName=liveName, subFolderName="pictures")
    procGifts(soup, metaName="comboFinishGifts",
              liveFolderName=liveName, subFolderName="pictures")
    # ticket info: an inline script near play.js embeds
    # `const liveTickets = JSON.parse(`...`)`; extract and save it
    playscript = soup.find("script", attrs={"src": "https://static.zan-live.com/static/js/live/play.js"})
    if playscript is not None:
        nextscript = playscript.find_next()
        while True:
            if nextscript is None or "const liveTickets = JSON.parse(`" in nextscript.text:
                break
            nextscript = nextscript.find_next()
        if nextscript is not None:
            livetickets = nextscript.text.split("const liveTickets = JSON.parse(`")[1]
            livetickets = livetickets.split("`);")[0]
            if (livetickets):
                liveticketsjson = json.loads(livetickets)
                for ticket in liveticketsjson:
                    ticket["name"] = html.unescape(ticket["name"])\
                        .replace(u"\u00A0", " ") if ticket["name"] else ticket["name"]
                    ticket["liveName"] = html.unescape(ticket["liveName"])\
                        .replace(u"\u00A0", " ") if ticket["liveName"] else ticket["liveName"]
                    if ticket.get("sns", None) is not None:
                        ticket["sns"]["hashTags"] = html.unescape(ticket["sns"]["hashTags"])\
                            .replace(u"\u00A0", " ") if ticket["sns"]["hashTags"] else ticket["sns"]["hashTags"]
                        ticket["sns"]["message"] = html.unescape(ticket["sns"]["message"])\
                            .replace(u"\u00A0", " ") if ticket["sns"]["message"] else ticket["sns"]["message"]
                print("has ticketinfo")
                with open(os.path.join(archivePath, liveName, "ticketinfo.json"), 'w', encoding="utf-8") as f:
                    json.dump(liveticketsjson, ensure_ascii=False, indent=4, fp=f)
            else:
                print("No ticketinfo")
    else:
        print("No ticketinfo")
    # live photo images (PC and smartphone variants) and any pictures embedded
    # in the detail sections
    phoImg = soup.find('div', attrs={'class': 'phoImg'})
    if phoImg is not None:
        pcPhoImg = phoImg.find('img', attrs={'class': 'pc'})
        spPhoImg = phoImg.find('img', attrs={'class': 'sp'})
        if pcPhoImg is not None:
            saveFileFromLink(
                link=pcPhoImg["src"], liveFolderName=liveName, subFolderName="pictures")
            print(pcPhoImg["src"])
        if spPhoImg is not None:
            saveFileFromLink(
                link=spPhoImg["src"], liveFolderName=liveName, subFolderName="pictures")
            print(spPhoImg["src"])
    for detailData in soup.find_all("div", attrs={'class': 'detailData'}):
        for loadedpic in detailData.find_all("img"):
            if loadedpic is not None:
                saveFileFromLink(
                    link=loadedpic["src"], liveFolderName=liveName, subFolderName="pictures")
if __name__ == "__main__":
    liveurl = input("zan-live url:")
    cookie = input("cookie value:")
    XCsrfToken = input("XCsrfToken value(for getting username, optional):")
    grabZan()