Skip to content

Instantly share code, notes, and snippets.

@mooware
Last active January 30, 2024 23:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mooware/bee55dd487e24c90961656d7a3282950 to your computer and use it in GitHub Desktop.
Save mooware/bee55dd487e24c90961656d7a3282950 to your computer and use it in GitHub Desktop.
# takes a csv export of the google sheet as input and tries to download pastebins
#
# usage:
# download_pastebins.py <prefix> <name column> <url column> <csv file>
#
# <name column> and <url column> are zero-based column indices into the csv rows.
#
# example:
# download_pastebins.py mt18 1 5 "MT18 Draw List - Played Games.csv"
import sys, os, re, urllib.request, csv
def sanitize_filename(name):
    """Return *name* with every disallowed character replaced by ``_``.

    Only ASCII letters, digits, ``.``, ``_``, space and ``-`` are kept;
    every other character is replaced one-for-one, so the length of the
    result equals the length of the input.
    """
    return re.sub(r"[^a-zA-Z0-9._ -]", "_", name)
def get_download_url(url):
    """Map a paste/share page URL to a URL that serves the raw content.

    Returns a ``(format, dlurl)`` tuple: ``format`` is the file extension
    to save under (``'pdf'`` for docs.google.com links, ``'txt'`` for
    everything else) and ``dlurl`` is the URL to fetch. URLs of hosts that
    are not recognized are returned unchanged.
    """
    fmt = 'txt'
    # Links may already point at the raw view; drop that marker so the
    # per-host rewrites below start from the plain page URL. Only a whole
    # "/raw" path segment is removed, so paste IDs that merely begin with
    # "raw" are left intact.
    page = re.sub(r"/raw(?=/|$)", "", url)
    if 'pastebin.com' in url:
        dlurl = page.replace("pastebin.com/", "pastebin.com/raw/")
    elif 'pastebin.pl' in url:
        dlurl = page.replace("view/", "view/raw/")
    elif 'paste.ee' in url:
        dlurl = page.replace("/r/", "/d/").replace("/p/", "/d/")
    elif 'rentry' in url:
        # Avoid producing "//raw" when the link ends with a slash.
        dlurl = page.rstrip("/") + "/raw"
    elif 'drive.google.com' in url:
        # ".*" (not ".+") so a link that ends right after the file ID
        # keeps the complete ID instead of losing its last character.
        dlurl = re.sub(r"file/d/([^/]+).*$", r"uc?id=\1&export=download", page)
    elif 'docs.google.com' in url:
        # ".*" (not ".+") so a link ending exactly in "/edit" is rewritten too.
        dlurl = re.sub(r"/edit.*$", "/export?format=pdf", page)
        fmt = 'pdf'
    else:
        # Unknown host: do not touch the URL; a "/raw" segment may be
        # a required part of its path.
        dlurl = url
    return (fmt, dlurl)
def download(url, timeout=60):
    """Fetch *url* and return the response body as ``bytes``.

    *timeout* is the number of seconds passed to ``urlopen`` for blocking
    network operations, so a stalled server cannot hang the whole run.
    Any exception raised by ``urllib`` propagates to the caller.
    """
    # The context manager closes the response even if read() fails.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read()
def download_list(prefix, table):
    """Download every ``(name, url)`` entry of *table* into its own file.

    The file name is built from *prefix*, the entry name, the original URL
    and the format extension, then passed through ``sanitize_filename``.
    Entries whose file already exists are skipped. A failure on one entry
    is printed and does not stop the remaining downloads.
    """
    for row in table:
        name = row[0].strip()
        url = row[1].strip()
        fmt, dlurl = get_download_url(url)
        filename = sanitize_filename(prefix + '_' + name + '_' + url + '.' + fmt)
        # Already fetched by an earlier run: nothing to do for this entry.
        if os.path.isfile(filename):
            continue
        print("download", name, dlurl, "into", filename)
        try:
            # Fetch first, so no file is created when the download fails.
            rawtext = download(dlurl)
            with open(filename, "wb") as f:
                f.write(rawtext)
        except Exception as e:
            # Best effort: report the problem and carry on with the next entry.
            print("exception:", e)
if __name__ == '__main__':
    # Command line: <prefix> <name column> <url column> <csv file>.
    # Extra trailing arguments are ignored.
    if len(sys.argv) < 5:
        sys.exit("usage: download_pastebins.py <prefix> <name column> <url column> <csv file>")
    prefix = sys.argv[1]
    namecol = int(sys.argv[2])
    urlcol = int(sys.argv[3])
    infile = sys.argv[4]
    # newline="" lets the csv module handle line endings inside quoted
    # fields itself; the with-block closes the file once the rows are read.
    with open(infile, "r", encoding="utf8", newline="") as csvfile:
        table = [
            (row[namecol], row[urlcol])
            for row in csv.reader(csvfile)
            # Skip blank/short rows and cells that do not hold a link.
            if len(row) > max(namecol, urlcol) and row[urlcol].startswith("http")
        ]
    download_list(prefix, table)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment