Download files from an HTTP server that allows directory traversal (auto-indexed listings).
# -*- coding: utf-8 -*-
__author__ = 'Delbert'
# Download files from an HTTP server that allows directory traversal.
# Python 3 only.
# requests and BeautifulSoup4 are required.
from bs4 import BeautifulSoup
import urllib.parse
import requests
import os

def init():
    global ignoredDir
    basepath = "HTTP SERVER ADDRESS"       # Begin with http://, https:// or ftp://
    downpath = "DIRECTORY TO STORE"        # A relative path ending with '/' is preferred
    ignoredDir = {'IGNORED FILE OR PATH'}  # Compared against the displayed link text, not the full path
    os.makedirs(downpath, exist_ok=True)   # Make sure the top-level download directory exists
    parse(basepath, downpath)

def parse(baseurl, localpath):
    print(localpath)
    currentUrl = baseurl
    currentLocalPath = localpath
    req = requests.get(currentUrl)
    raw_data = BeautifulSoup(req.text, "html.parser")
    all_link = raw_data.find_all("a")
    if not all_link:  # The listing is empty (e.g. an empty index.html)
        return
    if not all_link[0]["href"].startswith("?C="):  # The directory serves a normal page instead of an auto-index
        download(currentUrl, currentLocalPath + "index.html")
        return
    for d_link in all_link:
        if d_link["href"].startswith("?C=") or d_link.text == "Parent Directory":  # Skip sorting links and the parent directory
            continue
        if d_link.text in ignoredDir:
            continue
        if d_link.text.endswith('/'):  # A link to a child directory: recurse into it
            if not os.path.exists(currentLocalPath + d_link.text):
                os.mkdir(currentLocalPath + d_link.text)
            parse(currentUrl + d_link.text, currentLocalPath + d_link.text)
        else:  # A link to a file: make sure the local directory exists, then download
            if not os.path.exists(currentLocalPath):
                os.mkdir(currentLocalPath)
            download(currentUrl + d_link["href"], currentLocalPath + d_link["href"])

def download(downloadUrl, saveFile):
    print(urllib.parse.unquote(downloadUrl))
    savePath = urllib.parse.unquote(saveFile)
    if os.path.isfile(savePath) and os.path.getsize(savePath) > 0:  # Skip files already downloaded
        return
    furl = open("./furl.txt", "at", encoding='utf-8')  # Log every downloaded URL
    furl.write(urllib.parse.unquote(downloadUrl) + '\n')
    furl.close()
    r = requests.get(downloadUrl)
    content_type = r.headers.get('content-type', 'unknown').lower()
    if content_type.startswith("text"):  # Text file: decode and save as UTF-8
        try:
            temp = r.content.decode('utf-8')
        except UnicodeDecodeError:
            temp = r.content.decode('gb18030', 'ignore')  # Fall back to GB18030 for legacy Chinese encodings
        with open(savePath, "wt", encoding='utf-8') as dfile:
            dfile.write(temp)
    else:  # Binary file: save the raw bytes
        with open(savePath, "wb") as dfile:
            dfile.write(r.content)

def main():
    init()


if __name__ == '__main__':
    main()
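
For reference, a minimal usage sketch follows; the URL, local directory, and script filename below are hypothetical placeholders, and the sketch assumes the server exposes an Apache/nginx-style auto-index page at the configured address.

# Usage sketch (hypothetical values): edit init() so that, for example,
#     basepath = "http://example.com/pub/"   # listing URL; a trailing slash is required
#     downpath = "./pub/"                    # local directory to store the mirrored tree
#     ignoredDir = {"old-backups/"}          # link texts to skip while crawling
# and then run the script with Python 3:
#     python3 http_index_downloader.py
# Every downloaded URL is appended to ./furl.txt, and files that already exist
# locally with a non-zero size are skipped on re-runs.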