-
-
Save ezdiy/17855d7421bbb416cbb3d8e0e1caf213 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import logging
import sys
import time
import traceback

import requests

# Command-line arguments:
#   argv[1] - prefix of the per-bucket JSON dump files ("<prefix>_<id>.json")
#   argv[2] - path of the text file receiving one "id url [source]" line per video
outdb, outsrc = sys.argv[1], sys.argv[2]

# Single HTTP session shared by every API request made by this script.
sess = requests.Session()
def req(par):
    """Query the vid.me ``videos/list`` endpoint, retrying until it answers.

    Args:
        par: Ready-made query string appended after ``?`` in the URL.

    Returns:
        The decoded JSON document, or ``None`` when the response body is
        empty.

    Any failure (network error, timeout, undecodable JSON) is printed
    together with ``par`` and the request is retried indefinitely; the
    pause between attempts doubles after every failure (2s, 4s, 8s, ...).
    """
    url = "https://api.vid.me/videos/list?%s" % par
    backoff = 1
    while True:
        try:
            # Without a timeout a stalled connection would hang the scraper
            # forever; on expiry requests raises and we go through the
            # normal retry path below.
            data = sess.get(url, timeout=60).text
            if not data:
                return None
            return json.loads(data)
        except Exception:
            # `Exception` instead of a bare `except:` so that Ctrl-C
            # (KeyboardInterrupt) and SystemExit still stop the script
            # instead of being swallowed and retried.
            traceback.print_exc()
            backoff *= 2
            print(par)
            time.sleep(backoff)
# Scrape driver: walk the video id space downwards from a starting id,
# requesting windows of ids [low, vid + 1] with at most 100 videos each.
# Every new video is appended to the source list file and to a JSON dump
# file chosen by the 10000-wide id bucket the video falls into.

# Ask for the newest video to learn the top id and the catalogue size.
info = req('limit=1')
if not info or not info['videos']:
    sys.exit("[!] API returned no videos; nothing to scrape")
vid = int(info['videos'][0]['video_id'])
total = info['page']['total']
print("[*] Scraping db of %d videos, top vid=%d" % (total, vid))

# Hard-coded starting id that replaces the top id fetched above
# (NOTE(review): looks like a manual resume point - confirm before reuse;
# an earlier value left in the script was 17340513).
vid = 17450034

known = set()            # ids already written, to skip duplicates between windows
basestep = 500           # default width of the id window
step = basestep          # current width; widened while a window yields nothing new
low = max(vid - step, 0)
bucket_size = 10000      # ids per JSON dump file
currfile = None          # bucket of the dump file currently open (None = no file yet)
dbo = None               # handle of the dump file currently open

with open(outsrc, "w") as srco:
    try:
        while True:
            resp = req('minVideoId=%d&maxVideoId=%d&limit=100' % (low, vid + 1))
            if resp is None:
                # Empty body for this window: skip a little further down and
                # retry, but stop once the whole id space has been passed.
                vid -= 99
                if vid < 0:
                    break
                low = max(vid - step, 0)
                continue

            done = 0
            for v in resp['videos']:
                nvid = int(v['video_id'])
                if nvid in known:
                    continue
                done += 1
                known.add(nvid)
                vid = nvid

                # A source value without a '/' is treated as "no source".
                src = v["source"]
                if src and ('/' not in src):
                    src = None
                if src:
                    srco.write("%d %s %s\n" % (vid, v['url'], src))
                else:
                    srco.write("%d %s\n" % (vid, v['url']))

                # The complete_url field is blanked before the record is dumped.
                v['complete_url'] = None

                # Switch dump files whenever the video falls into another bucket.
                rng = vid - vid % bucket_size
                if rng != currfile:
                    if dbo:
                        # Each dump is a JSON array terminated by a trailing null.
                        dbo.write("null]")
                        dbo.close()
                    currfile = rng
                    nfn = "%s_%08d.json" % (outdb, rng)
                    print("[*] New dump %s" % nfn)
                    dbo = open(nfn, "w")
                    dbo.write("[")
                dbo.write(json.dumps(v) + ",")

            if not done:
                if low == 0:
                    # The window already reaches id 0 and holds nothing new:
                    # the scrape is complete.
                    break
                # Nothing new in this window: widen it and try again.
                step += basestep
                low = max(vid - step, 0)
                print("Raising step to %d" % step)
                continue

            step = basestep
            got = len(known)
            pct = got * 100.0 / total if total else 0.0
            print("[*] %d/%d done, %.3f%%, top=%d" % (got, total, pct, vid))
            low = max(vid - step, 0)
    finally:
        # Terminate and close the last dump even when the run is interrupted,
        # so the file on disk stays valid JSON.
        if dbo:
            dbo.write("null]")
            dbo.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment