Download files from an HTTP server that allows directory traversal (auto-indexed listings).
# -*- coding: utf-8 -*-
__author__ = 'Delbert'
# Download files from an HTTP server that allows directory traversal.
# Python 3 only.
# requests and BeautifulSoup4 are required.
from bs4 import BeautifulSoup
import urllib.parse
import requests
import os

def init():
    global ignoredDir
    basepath = "HTTP SERVER ADDRESS"       # Begin with http://, https:// or ftp://
    downpath = "DIRECTORY TO STORE"        # A relative path ending with '/' is preferred
    ignoredDir = {'IGNORED FILE OR PATH'}  # Compared against the displayed link text, not the full path
    os.makedirs(downpath, exist_ok=True)   # Make sure the top-level download directory exists
    parse(basepath, downpath)

def parse(baseurl, localpath):
    print(localpath)
    currentUrl = baseurl
    currentLocalPath = localpath
    req = requests.get(currentUrl)
    raw_data = BeautifulSoup(req.text, "html.parser")
    all_link = raw_data.find_all("a")
    if not all_link:  # The listing is empty (e.g. an empty index.html)
        return
    if not all_link[0]["href"].startswith("?C="):  # The directory serves a normal page instead of an auto-index
        download(currentUrl, currentLocalPath + "index.html")
        return
    for d_link in all_link:
        if d_link["href"].startswith("?C=") or d_link.text == "Parent Directory":  # Skip sorting links and the parent directory
            continue
        if d_link.text in ignoredDir:
            continue
        if d_link.text.endswith('/'):  # A link to a child directory: recurse into it
            if not os.path.exists(currentLocalPath + d_link.text):
                os.mkdir(currentLocalPath + d_link.text)
            parse(currentUrl + d_link.text, currentLocalPath + d_link.text)
        else:  # A link to a file: make sure the local directory exists, then download
            if not os.path.exists(currentLocalPath):
                os.mkdir(currentLocalPath)
            download(currentUrl + d_link["href"], currentLocalPath + d_link["href"])

def download(downloadUrl, saveFile):
    print(urllib.parse.unquote(downloadUrl))
    savePath = urllib.parse.unquote(saveFile)
    if os.path.isfile(savePath) and os.path.getsize(savePath) > 0:  # Skip files already downloaded
        return
    furl = open("./furl.txt", "at", encoding='utf-8')  # Log every downloaded URL
    furl.write(urllib.parse.unquote(downloadUrl) + '\n')
    furl.close()
    r = requests.get(downloadUrl)
    content_type = r.headers.get('content-type', 'unknown').lower()
    if content_type.startswith("text"):  # Text file: decode and save as UTF-8
        try:
            temp = r.content.decode('utf-8')
        except UnicodeDecodeError:
            temp = r.content.decode('gb18030', 'ignore')  # Fall back to GB18030 for legacy Chinese encodings
        with open(savePath, "wt", encoding='utf-8') as dfile:
            dfile.write(temp)
    else:  # Binary file: save the raw bytes
        with open(savePath, "wb") as dfile:
            dfile.write(r.content)

def main():
    init()


if __name__ == '__main__':
    main()
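
For reference, a minimal usage sketch follows; the URL, local directory, and script filename below are hypothetical placeholders, and the sketch assumes the server exposes an Apache/nginx-style auto-index page at the configured address.

# Usage sketch (hypothetical values): edit init() so that, for example,
#     basepath = "http://example.com/pub/"   # listing URL; a trailing slash is required
#     downpath = "./pub/"                    # local directory to store the mirrored tree
#     ignoredDir = {"old-backups/"}          # link texts to skip while crawling
# and then run the script with Python 3:
#     python3 http_index_downloader.py
# Every downloaded URL is appended to ./furl.txt, and files that already exist
# locally with a non-zero size are skipped on re-runs.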