Skip to content

Instantly share code, notes, and snippets.

@Jwely
Last active March 4, 2024 11:39
Show Gist options
  • Star 42 You must be signed in to star a gist
  • Fork 9 You must be signed in to fork a gist
  • Save Jwely/ad8eb800bacef9e34dd775f9b3aad987 to your computer and use it in GitHub Desktop.
Save Jwely/ad8eb800bacef9e34dd775f9b3aad987 to your computer and use it in GitHub Desktop.
recursive ftp directory downloader with python
import ftplib
import os
import re
"""
MIT license: 2017 - Jwely
Example usage:
``` python
import ftplib
ftp = ftplib.FTP(mysite, username, password)
download_ftp_tree(ftp, remote_dir, local_dir)
```
The code above will look for a directory called "remote_dir" on the ftp host, and then duplicate the
directory and its entire contents into the "local_dir".
*** Note that if wget is an option, I recommend using that instead ***
"""
def _is_ftp_dir(ftp_handle, name, guess_by_extension=True):
    """
    Decide whether ``name`` on the ftp server can be entered as a directory.

    :param ftp_handle: object exposing ``pwd()`` and ``cwd()`` (an ftplib.FTP instance)
    :param name: remote path to probe
    :param guess_by_extension: when True, a name with "." in the fourth-to-last
        position is treated as a file without asking the server
    :return: True if the server allowed changing into ``name``, otherwise False
    """
    # Shortcut: a "." four characters from the end looks like a three letter
    # file extension, so skip the comparatively slow round trip to the server.
    if guess_by_extension is True and len(name) >= 4 and name[-4] == '.':
        return False

    starting_dir = ftp_handle.pwd()  # where to return to after the probe
    try:
        ftp_handle.cwd(name)  # only succeeds for directories
        ftp_handle.cwd(starting_dir)  # undo the probe
    except Exception as err:
        # ftplib.error_perm and every other failure are reported the same way
        print(err)
        return False
    return True
def _make_parent_dir(fpath):
    """
    Ensure the parent directory of a filepath exists.

    :param fpath: path of a file that is about to be written
    :return: None. Failures to create the directory are printed, not raised.
    """
    dirname = os.path.dirname(fpath)

    # A bare filename has no parent component. os.makedirs("") always fails,
    # so retrying on it (as the old loop did) could never terminate.
    if not dirname or os.path.exists(dirname):
        return

    try:
        # makedirs already creates every missing intermediate directory;
        # exist_ok tolerates the directory appearing between check and create.
        os.makedirs(dirname, exist_ok=True)
    except OSError as e:
        # best effort: report and give up instead of retrying forever
        print(e)
    else:
        print("created {0}".format(dirname))
def _download_ftp_file(ftp_handle, name, dest, overwrite):
    """
    Download a single file from an ftp server.

    :param ftp_handle: object exposing ``retrbinary()`` (an ftplib.FTP instance)
    :param name: remote path of the file, sent to the server as ``RETR <name>``
    :param dest: local path to write; leading "/" characters are dropped so the
        file always lands below the current working directory
    :param overwrite: set to True to download even if the local file exists
    :raises: any error from the transfer other than FileNotFoundError is
        re-raised after the incomplete local file has been removed
    """
    # Use one and the same path for creating the parent directory, for the
    # existence check and for writing. Previously the directory was created
    # for the stripped path while the file was opened at the un-stripped one.
    local_path = dest.lstrip("/")
    _make_parent_dir(local_path)

    if os.path.exists(local_path) and overwrite is not True:
        print("already exists: {0}".format(local_path))
        return

    try:
        with open(local_path, 'wb') as f:
            ftp_handle.retrbinary("RETR {0}".format(name), f.write)
    except FileNotFoundError:
        print("FAILED: {0}".format(local_path))
    except BaseException:
        # A failed or interrupted transfer leaves a truncated file behind; if
        # it stayed, the next run would report it as "already exists".
        try:
            os.remove(local_path)
        except OSError:
            pass  # nothing was created, or it is already gone
        raise
    else:
        print("downloaded: {0}".format(local_path))
def _file_name_match_patern(pattern, name):
    """
    Tell whether ``name`` passes the optional regex filter.

    :param pattern: regex handed to ``re.match`` (anchored at the start of
        ``name``), or None to accept every name
    :param name: the string to test
    :return: True when no pattern is given or the pattern matches, else False
    """
    if pattern is not None:
        return re.match(pattern, name) is not None
    # no filter configured: everything matches
    return True
def _mirror_ftp_dir(ftp_handle, name, overwrite, guess_by_extension, pattern):
    """
    Replicate a directory on an ftp server recursively.

    Every entry listed for ``name`` is either descended into (directories) or
    downloaded to a local path identical to its remote path (files).

    :param ftp_handle: object exposing ``nlst()`` (an ftplib.FTP instance)
    :param name: remote directory to list
    :param overwrite: passed through to the file download
    :param guess_by_extension: passed through to the directory check
    :param pattern: regex a file's path must match to be downloaded, or None
        to download every file
    """
    for item in ftp_handle.nlst(name):
        # If a listing contains "." or ".." entries, descending into them
        # would visit the same directories again without end.
        if item.rstrip("/").rsplit("/", 1)[-1] in (".", ".."):
            continue

        if _is_ftp_dir(ftp_handle, item, guess_by_extension):
            _mirror_ftp_dir(ftp_handle, item, overwrite, guess_by_extension, pattern)
        elif _file_name_match_patern(pattern, item):
            # The filter is applied to the file itself (item), not to the
            # directory being listed (name), which is what was tested before.
            _download_ftp_file(ftp_handle, item, item, overwrite)
        # files that do not match the pattern are quietly skipped
def download_ftp_tree(ftp_handle, path, destination, pattern=None, overwrite=False, guess_by_extension=True):
    """
    Downloads an entire directory tree from an ftp server to the local destination

    The process working directory is switched to ``destination`` while the
    tree is copied and is restored afterwards, even if the copy fails.

    :param ftp_handle: an authenticated ftplib.FTP instance
    :param path: the folder on the ftp server to download
    :param destination: the local directory to store the copied folder
    :param pattern: Python regex pattern, only files that match this pattern will be downloaded.
    :param overwrite: set to True to force re-download of all files, even if they appear to exist already
    :param guess_by_extension: It takes a while to explicitly check if every item is a directory or a file.
        if this flag is set to True, it will assume any file ending with a three character extension ".???" is
        a file and not a directory. Set to False if some folders may have a "." in their names -4th position.
    """
    path = path.lstrip("/")  # keep the remote path relative
    original_directory = os.getcwd()  # remember working directory before function is executed
    os.chdir(destination)  # change working directory to ftp mirror directory
    try:
        _mirror_ftp_dir(
            ftp_handle,
            path,
            pattern=pattern,
            overwrite=overwrite,
            guess_by_extension=guess_by_extension)
    finally:
        # always undo the chdir so an error during mirroring does not leave
        # the caller stranded in the destination directory
        os.chdir(original_directory)
if __name__ == "__main__":
    # Example usage mirroring all jpg files in an FTP directory tree.
    mysite = "some_ftp_site"
    username = "anonymous"
    password = None
    remote_dir = ""
    local_dir = ""
    # raw string: "\." in a plain string literal is an invalid escape sequence
    pattern = r".*\.jpg$"
    # the with-block closes the connection once the download has finished
    with ftplib.FTP(mysite, username, password) as ftp:
        download_ftp_tree(ftp, remote_dir, local_dir, pattern=pattern, overwrite=False, guess_by_extension=True)
@williameast
Copy link

As i could not edit my comment for some reason:

I found a workaround for my issue!
While this has not been proven to work in all edge cases (Japanese characters still do not work so far!), I just build the nlst() output from mlsd() and call that instead of `ftp_handle.nlst()` in line 84. This now has no problem with directories that have square brackets in their names.

   def nlstSafe(self, directory):
       """
       Build an nlst()-style list of paths for ``directory`` from mlsd().

       NOTE(review): ``self`` is not used; ``ftp_handle`` is taken from the
       enclosing scope, so it must be defined where this function lives.

       :param directory: remote directory to list
       :return: list of ``directory`` joined with each entry name
       """
       # Remote paths are joined with "/" regardless of the local OS;
       # os.path.join would insert "\\" when run on Windows.
       import posixpath

       return [
           posixpath.join(directory, entry[0])
           for entry in ftp_handle.mlsd(directory)
           # "." and ".." entries would make a recursive walk loop forever
           if entry[0] not in (".", "..")
       ]

@MtBook-FUK
Copy link

MtBook-FUK commented Jan 12, 2023

@Jwely, Thanks for this code!
I have a question about the _mirror_ftp_dir function. Shouldn't the second argument of the _file_name_match_patern function be "item" instead of "name"? Since "name" refers to the directory path, _file_name_match_patern(pattern, name) would be applying the regular expression to the directory path. I think "item" would be appropriate if you want to apply the regular expression to the file path.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment