Last active
November 9, 2022 15:22
-
-
Save boogheta/6babff3c34714d28814295729cbed22a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
import csv
import json
import math
import os
import shlex
import sys

import requests
# Keys read from each search result item and exported to the CSV.
# - A value of None: the top-level key is copied as-is into a column named
#   after the key.
# - A list value: the key holds a nested object and each listed sub-field is
#   exported as a "<key>.<field>" column.
metadata = {
    "id": None, "desc": None, "createTime": None, "originalItem": None,
    "video": ["id", "height", "width", "duration", "ratio", "cover", "originCover", "dynamicCover", "downloadAddr", "reflowCover", "bitrate", "format", "videoQuality"],
    "author": ["id", "uniqueId", "nickname", "avatarLarger", "signature", "verified", "secret"],
    "authorStats": ["followingCount", "followerCount", "heartCount", "videoCount", "diggCount", "heart"],
    "music": ["id", "title", "authorName", "original", "playUrl", "duration"],
    "stats": ["diggCount", "shareCount", "commentCount", "playCount"]
    # "duetInfo": [],
}

# Optional list-valued keys of an item. Each maps to the sub-field collected
# from every element of the list; the collected values are joined with "|"
# into a single column named "<key>s".
multi_fields = {
    "textExtra": "hashtagName",
    # "challenges": "title",  (redundant with textExtra)
    "stickersOnItem": "stickerText"
}

# CSV column names in first-seen order. Filled while results are scraped and
# used as the header of the output CSV.
fields = []
def parse_curl_query(query):
    """Extract the URL and HTTP headers from a "Copy as cURL" command line.

    The command is tokenised with shell quoting rules, so options other than
    headers (e.g. ``--compressed``) and backslash-continued multi-line
    commands are tolerated instead of corrupting the last header value.

    Args:
        query: The full cURL command; it must start with ``curl '``.

    Returns:
        A ``(url, headers)`` tuple where ``headers`` maps header names to
        values for every ``-H`` / ``--header`` option found. The
        ``Accept-Encoding`` header is always dropped (presumably so the HTTP
        client negotiates an encoding it can decode itself).

    Exits the program with "Query is badly formatted" when the command does
    not start with ``curl '`` or its quotes are unbalanced.
    """
    if not query.startswith("curl '"):
        sys.exit("Query is badly formatted")
    try:
        tokens = shlex.split(query)
    except ValueError:
        # Raised by shlex on unbalanced quotes or a dangling escape.
        sys.exit("Query is badly formatted")

    # tokens[0] is "curl"; the guard above guarantees the quoted URL follows.
    url = tokens[1]
    #print(url)
    headers = {}
    remaining = iter(tokens[2:])
    for token in remaining:
        if token not in ("-H", "--header"):
            # Other options (and stray line-continuation tokens) are ignored.
            continue
        header = next(remaining, "")
        key, sep, value = header.partition(": ")
        if not sep:
            # Not a "Name: value" header; nothing usable to forward.
            continue
        if key.lower() == "accept-encoding":
            continue
        headers[key] = value
    #print(headers)
    return url, headers
def downloadAndScrape(url, headers, nb_results, output_dir):
    """Page through the search API and flatten the returned items into rows.

    Pages are requested 12 results at a time by substituting the
    ``offset=0`` fragment of ``url`` with the offset of each page; if the
    URL does not contain ``offset=0`` the same page is requested every time.

    Args:
        url: Search API URL for the first page (containing ``offset=0``).
        headers: HTTP headers sent with every request.
        nb_results: Desired number of results; rounded up to whole pages, so
            slightly more rows than requested may be returned.
        output_dir: Currently unused; kept so existing callers keep working.

    Returns:
        A list of dicts, one per kept item, keyed by CSV column name. Column
        names are also appended to the module-level ``fields`` list.

    Raises:
        KeyError: If a kept item lacks one of the keys listed in
            ``metadata``.
    """
    page_size = 12
    # Round up so that asking for fewer than one page still fetches a page.
    nb_pages = math.ceil(nb_results / page_size)
    results = []
    print("Calling base url %s" % url)
    for page in range(nb_pages):
        page_url = url.replace("offset=0", "offset=%d" % (page_size * page))
        print("Requesting page %s" % page)
        data = requests.get(page_url, headers=headers).json().get("data")
        if not data:
            # No (more) results: keep what was collected instead of crashing.
            print("No more results returned, stopping at page %s" % page)
            break
        for entry in data:
            # Only entries of type 1 are kept (presumably the video results).
            if entry["type"] != 1:
                continue
            results.append(_extract_video(entry["item"]))
    return results


def _extract_video(item):
    """Flatten one raw result item into a row dict, registering new columns in `fields`."""
    video = {}
    for key, subfields in metadata.items():
        if subfields and isinstance(subfields, list):
            # Nested object: export each listed sub-field as "<key>.<field>".
            for field in subfields:
                fieldname = "%s.%s" % (key, field)
                if fieldname not in fields:
                    fields.append(fieldname)
                video[fieldname] = item[key][field]
        else:
            video[key] = item[key]
            if key not in fields:
                fields.append(key)
    for key, subfield in multi_fields.items():
        if key not in item:
            continue
        column = key + "s"
        # Each element contributes one value, or a "|"-joined list of values.
        video[column] = "|".join(
            "|".join(el[subfield]) if isinstance(el[subfield], list) else el[subfield]
            for el in item[key]
        )
        if column not in fields:
            fields.append(column)
    return video
# Command-line entry point: parse the pasted cURL command, scrape the
# requested number of results and write them to <output_dir>/videos.csv.
if __name__ == "__main__":
    if len(sys.argv) < 4:
        sys.exit("""
To run this script, first connect to TikTok in your browser, open the developer tools (F12) and its "Network" tab, then do the search you're interested in. In the Network tab select XHR and search inside the line with "www.tiktok.com" as Domain, and File starting with "/api/search/general/full/", right click on it and do "Copy as cURL". Then run the script as:\n\npython scrapeTikTokSearch.py "HERE PASTE THE QUERY YOU JUST COPIED" NUMBER_OF_DESIRED_VIDEOS NAME_OF_THE_DIRECTORY_IN_WHICH_EVERYTHING_WILL_BE_COLLECTED
""")
    curl_query = sys.argv[1]
    try:
        nb_results = int(sys.argv[2])
    except ValueError:
        sys.exit("NUMBER_OF_DESIRED_VIDEOS must be an integer, got %r" % sys.argv[2])
    output_dir = sys.argv[3]
    url, headers = parse_curl_query(curl_query)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)
    results = downloadAndScrape(url, headers, nb_results, output_dir)
    output_file = os.path.join(output_dir, "videos.csv")
    # newline="" is required by the csv module (otherwise blank rows appear on
    # Windows); explicit UTF-8 keeps non-ASCII text from failing on platforms
    # whose default encoding cannot represent it.
    with open(output_file, "w", newline="", encoding="utf-8") as f:
        wr = csv.DictWriter(f, fields)
        wr.writeheader()
        wr.writerows(results)
    print("\n\nMetadata on %d videos downloaded in %s.\n\nDownload the corresponding videos by running:\nminet fetch --filename-template \"{line['id']}.{line['video.format']}\" video.downloadAddr %s -d %s" % (len(results), output_file, output_file, output_dir))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment