Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Shallow download files from a Google Drive folder with requests, bs4, and regex
import re
from bs4 import BeautifulSoup
import requests
import gdown
import json
import argparse
# Pretty dumb string regex: it captures single-quoted runs and would fail on
# strings that contain escaped quotes.
string_regex = re.compile(r"\'([^\']+)\'")


def find_encoded_data(script_texts):
    """Return the escaped data string from the first ``_DRIVE_ivd`` script.

    Args:
        script_texts: Iterable of the text content of each ``<script>`` tag.

    Returns:
        The second single-quoted string found in the first script whose text
        contains ``_DRIVE_ivd``, still in its escaped form.

    Raises:
        RuntimeError: If no script mentions ``_DRIVE_ivd``, or the one that
            does holds fewer than two single-quoted strings.
    """
    for text in script_texts:
        if "_DRIVE_ivd" not in text:  # hacky script tag search
            continue
        quoted = string_regex.findall(text)
        if len(quoted) < 2:
            raise RuntimeError(
                "Found the _DRIVE_ivd script tag but no data string in it"
            )
        # Second match is the payload; the first one is the quoted
        # '_DRIVE_ivd' name that precedes it.
        return quoted[1]
    raise RuntimeError("Didn't found script tag")


def unescape_js_string(encoded_data):
    """Resolve backslash escapes (``\\x22``, ``\\u003d``, ...) in *encoded_data*.

    The ``unicode_escape`` codec reads its input bytes as Latin-1, so the text
    is encoded as Latin-1 with ``backslashreplace`` first. That keeps literal
    non-ASCII characters intact, whereas a UTF-8 round trip would turn them
    into mojibake.
    """
    raw = encoded_data.encode("latin-1", "backslashreplace")
    return raw.decode("unicode_escape")


def decode_file_ids(encoded_data):
    """Return the file ids listed in the escaped JSON payload.

    The payload is unescaped and parsed as JSON; the first element of every
    entry in ``data[0]`` is taken as a file id.
    """
    data = json.loads(unescape_js_string(encoded_data))
    # Don't know why these indices; they were found by inspecting the payload.
    # NOTE(review): a null ``data[0]`` is treated as "nothing to download";
    # confirm whether Drive actually emits that for an empty folder.
    entries = data[0] or []
    return [elem[0] for elem in entries]


def main():
    """Download every file listed in the Drive folder page given on the CLI."""
    parser = argparse.ArgumentParser()
    parser.add_argument("url")
    args = parser.parse_args()

    response = requests.get(args.url)
    # Surface HTTP errors directly instead of a misleading "script tag" error.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    encoded_data = find_encoded_data(
        script.text for script in soup.select("script")
    )
    for id_ in decode_file_ids(encoded_data):
        gdown.download('https://drive.google.com/uc?id=' + id_)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment