#!/usr/bin/env python
import os
import sys
import csv
import json
import requests

metadata = {
    "id": None, "desc": None, "createTime": None, "originalItem": None,
    "video": ["id", "height", "width", "duration", "ratio", "cover", "originCover", "dynamicCover", "downloadAddr", "reflowCover", "bitrate", "format", "videoQuality"],
    "author": ["id", "uniqueId", "nickname", "avatarLarger", "signature", "verified", "secret"],
    "authorStats": ["followingCount", "followerCount", "heartCount", "videoCount", "diggCount", "heart"],
    "music": ["id", "title", "authorName", "original", "playUrl", "duration"],
    "stats": ["diggCount", "shareCount", "commentCount", "playCount"]
    # "duetInfo": [],
}
multi_fields = {
    "textExtra": "hashtagName",
    # "challenges": "title", redundant with textExtra
    "stickersOnItem": "stickerText"
}
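# These two dicts drive the CSV layout: metadata keys mapped to None become
# columns as-is, keys mapped to a list are nested dicts flattened into
# "parent.child" columns (e.g. "author.uniqueId" or "stats.diggCount"), and
# each multi_fields entry is joined with "|" into a single pluralized column
# (e.g. "textExtras" holding "hashtag1|hashtag2").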
fields = []

def parse_curl_query(query):
    # Split a browser-copied "Copy as cURL" command into the request URL and
    # its headers, skipping the Accept-Encoding header
    if not query.startswith("curl '"):
        sys.exit("Query is badly formatted")
    query = query.replace("curl '", "", 1)
    url, args = query.split("'", 1)
    headers = {}
    args = args.replace(" -H '", "", 1).rstrip("'")
    for arg in args.split("' -H '"):
        key, value = arg.split(": ", 1)
        if key.lower() == "accept-encoding":
            continue
        headers[key] = value
    return url, headers
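# A minimal sketch of the input parse_curl_query expects and what it returns
# (the URL and header values here are illustrative, not a real TikTok query):
#
#   url, headers = parse_curl_query(
#       "curl 'https://www.tiktok.com/api/search/general/full/?keyword=cats&offset=0'"
#       " -H 'User-Agent: Mozilla/5.0' -H 'Cookie: tt_webid=123'"
#   )
#   # url     == "https://www.tiktok.com/api/search/general/full/?keyword=cats&offset=0"
#   # headers == {"User-Agent": "Mozilla/5.0", "Cookie": "tt_webid=123"}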
def downloadAndScrape(url, headers, nb_results, output_dir):
    # The search endpoint serves 12 results per page: round up so that at
    # least nb_results videos are collected
    nb_pages = (nb_results + 11) // 12
    page = 0
    results = []
    print("Calling base url %s" % url)
    while page < nb_pages:
        offset = 12 * page
        page_url = url.replace("offset=0", "offset=%d" % offset)
        print("Requesting page %s" % page)
        data = requests.get(page_url, headers=headers).json()["data"]
        # Cache each page's raw JSON next to the final CSV
        cache_file = os.path.join(output_dir, "results-%d.json" % page)
        with open(cache_file, "w") as f:
            json.dump(data, f)
        for v in data:
            video = {}
            # Keep only actual video results (type 1)
            if v["type"] != 1:
                continue
            v = v["item"]
            for key, val in metadata.items():
                if val and type(val) == list:
                    for field in val:
                        fieldname = "%s.%s" % (key, field)
                        if fieldname not in fields:
                            fields.append(fieldname)
                        video[fieldname] = v[key][field]
                else:
                    video[key] = v[key]
                    if key not in fields:
                        fields.append(key)
            for key, val in multi_fields.items():
                resKey = key + "s"
                if key in v:
                    video[resKey] = "|".join("|".join(el[val]) if type(el[val]) == list else el[val] for el in v[key])
                    if resKey not in fields:
                        fields.append(resKey)
            results.append(video)
        page += 1
    return results
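# For example, requesting 30 videos yields ceil(30 / 12) = 3 pages, fetched by
# rewriting the copied URL's offset parameter to 0, 12 and 24 successively.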
if __name__ == "__main__":
    if len(sys.argv) < 4:
        sys.exit("""
To run this script, first connect to TikTok in your browser, open the developer tools (F12) and their "Network" tab, then run the search you're interested in. In the Network tab, select XHR and find the line with "www.tiktok.com" as Domain and a File starting with "/api/search/general/full/", right-click on it and choose "Copy as cURL". Then run the script as:

python scrapeTikTokSearch.py "PASTE HERE THE QUERY YOU JUST COPIED" NUMBER_OF_DESIRED_VIDEOS NAME_OF_THE_DIRECTORY_IN_WHICH_EVERYTHING_WILL_BE_COLLECTED
""")
    curl_query = sys.argv[1]
    nb_results = int(sys.argv[2])
    output_dir = sys.argv[3]
    url, headers = parse_curl_query(curl_query)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    results = downloadAndScrape(url, headers, nb_results, output_dir)
    output_file = os.path.join(output_dir, "videos.csv")
    # newline="" prevents the csv module from writing blank lines on Windows
    with open(output_file, "w", newline="") as f:
        wr = csv.DictWriter(f, fields)
        wr.writeheader()
        wr.writerows(results)
    print("\n\nMetadata on %d videos downloaded in %s.\n\nDownload the corresponding videos by running:\nminet fetch --filename-template \"{line['id']}.{line['video.format']}\" video.downloadAddr %s -d %s" % (len(results), output_file, output_file, output_dir))
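# Example invocation (hypothetical, with the cURL string abridged):
#   python scrapeTikTokSearch.py "curl 'https://www.tiktok.com/api/search/general/full/?keyword=cats&offset=0' -H 'User-Agent: ...' -H 'Cookie: ...'" 100 cats-videos
# This writes cats-videos/videos.csv plus one results-N.json cache file per
# page; the actual video files can then be fetched with the minet command
# printed at the end.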