Skip to content

Instantly share code, notes, and snippets.

@JimHokanson
Created June 25, 2013 06:15
Show Gist options
  • Save JimHokanson/5856338 to your computer and use it in GitHub Desktop.
Save JimHokanson/5856338 to your computer and use it in GitHub Desktop.
Rough version of FTP calls for retrieving tracker data. Need to add on saving code and check possible change to change directory call.
try:
import cPickle as pickle
except:
import pickle
from ftplib import FTP
import re
import time #For modification time parsing
#For later
#----------------------------------------------
#import os
#import h5py
"""
CODE DESIGN OUTLINES
=====================================================================
1) Provide an interface for retrieving data. Eventually this could become
a class that provides caching, with subclasses implementing the specific
ways of retrieving the data.
"""
# Local file intended to hold cached (pickled) FTP results.
CACHED_FTP_RESULTS_PATH = r"F:\worm_data\ftp_cache.p"

# Example listing lines the regular expression below is written for:
#   'drwxr-xr-x 3 24439 24439 4096 Nov 12 2012 C11D2.2'
#   'drwxr-xr-x 3 24439 24439 4096 Nov 12 2012 ok1565'
#   '-rw-r--r-- 1 24439 24439 25180977 Nov 12 2012 CIID2.2 (ok1565)IV on food R_2011_08_04__12_26_51___1___6_features.mat'
#
# Column order matched by the pattern:
#   attr nlinks owner group size month day year filename
#
# The pattern is a raw string so that \S, \s and \d reach the regex engine
# untouched instead of being treated as (invalid) string escape sequences.
#
# NOTE(review): the "year" group only accepts digits, so a line whose date
# column holds a time of day such as '14:30' (presumably emitted by some
# servers for recent files - confirm) would not be split correctly.
RE_COMPILED_PATTERN = re.compile(
    r'(?P<attr>\S*)\s*(\d+)\s*([^ ]*)\s*([^ ]*)\s*(\d+)\s*'
    r'(?P<month>\S*)\s*(?P<day>\d+)\s*(?P<year>\d+)\s*(?P<name>.*)')

# Root directory on the server from which the recursive listing starts.
BASE_FTP_PATH = "/pub/tjucikas/wormdatabase/results-12-06-08/Laura Grundy"
FTP_URL = "ftp.mrc-lmb.cam.ac.uk"
#========================================================================================
def get_parsed_listing(RE_COMPILED_PATTERN,list_str_element):
    """
    Parse one raw line of a directory listing into a dictionary.

    dict_listing = get_parsed_listing(re_compiled_pattern,list_element)

    :param RE_COMPILED_PATTERN: compiled regular expression that defines the
        named groups 'attr', 'month', 'day', 'year' and 'name'
    :param list_str_element: one listing line, roughly of the form
        'drwxr-xr-x 3 24439 24439 4096 Nov 12 2012 C11D2.2'
    :returns: dict_listing with the keys
        type              - single character file type (first char of attr)
                            d : directory, l : link, - : file, s : socket file
        name              - file or folder name
        last_modified_num - numeric for last modified time (time.mktime result)
        last_modified_str - last modified time as string, '<month> <day> <year>'
    :raises ValueError: if the line does not match the pattern, if the
        attribute column is empty, or if the date cannot be parsed

    IMPROVEMENTS:
        some properties from the listing are currently unparsed
    """
    # FTP servers can be fairly loose about how they respond to requests, so
    # anything the pattern cannot handle is reported with the offending line.
    error_message = "Regular expression error parsing this string:\n{0}".format(list_str_element)

    m = RE_COMPILED_PATTERN.search(list_str_element)
    if m is None:
        raise ValueError(error_message)

    d = m.groupdict() #dictionary containing named tokens

    # 'attr' holds the unparsed unix attributes, e.g. 'drwxr-xr-x'. The
    # pattern may capture it as an empty string (e.g. a line that starts
    # with whitespace); report that as a parse failure rather than letting
    # the [0] index below fail with an unrelated IndexError.
    attr = d["attr"]
    if not attr:
        raise ValueError(error_message)

    # Rebuild the date from its three captured pieces and parse it.
    # time.strptime raises ValueError when the pieces do not form a date.
    last_modified_str = ' '.join((d["month"], d["day"], d["year"]))
    last_modified_time = time.strptime(last_modified_str, "%b %d %Y")

    dict_listing = {
        'type': attr[0],
        'name': d["name"],
        'last_modified_str': last_modified_str,
        'last_modified_num': time.mktime(last_modified_time),
    }
    return dict_listing
#=========================================================================================
def get_files_in_directory(ftp,file_list,RE_COMPILED_PATTERN,base_path):
    """
    Recursively list a directory on the FTP server.

    :param ftp: connected FTP object; its cwd() and retrlines() methods are used
    :param file_list: list that is extended in place with the full path
        (base_path + '/' + name) of every regular file that is found
    :param RE_COMPILED_PATTERN: compiled regular expression handed to
        get_parsed_listing for every listing line
    :param base_path: directory to list; sub-directories are visited by
        calling this function again with the longer path
    :returns: dictionary mapping each entry name to its parsed listing
        dictionary; directory entries additionally carry a 'child' key that
        holds the dictionary returned for that sub-directory
    """
    ftp.cwd(base_path)

    cur_dir_contents_raw = [] #Will hold one raw string per listing line
    ftp.retrlines('LIST', cur_dir_contents_raw.append)

    # Avoid a doubled separator when base_path already ends with '/'.
    path_prefix = base_path.rstrip('/') + '/'

    dict_output = {}
    for list_element in cur_dir_contents_raw:
        # Some servers emit blank lines or an "ls -l" style "total N"
        # summary line; neither describes an entry, so skip them instead
        # of letting the parser abort the whole traversal.
        if not list_element.strip() or list_element.startswith('total '):
            continue

        #Parse the string into a meaningful dictionary
        d = get_parsed_listing(RE_COMPILED_PATTERN,list_element)
        cur_listing_type = d["type"]
        cur_name = d["name"]

        # The current/parent directory entries would make the recursion
        # below loop forever, so they are never followed or recorded.
        if cur_name in ('.', '..'):
            continue

        cur_path = path_prefix + cur_name

        #DEBUG LINE
        print('Dir: ' + cur_path)

        if cur_listing_type == 'd':
            # Recursive call - change to directory and get contents
            d['child'] = get_files_in_directory(ftp,file_list,RE_COMPILED_PATTERN,cur_path)
        elif cur_listing_type == '-':
            # append to file list
            file_list.append(cur_path)
        else:
            print('FTP warning: unhandled type: "{0}" with name: "{1}" at: "{2}"'.format(cur_listing_type,cur_name,base_path))

        dict_output[cur_name] = d

    return dict_output
#=======================================================================================================================
def get_ftp_connection(FTP_URL):
    """
    Open an FTP session to the given host and log in without credentials.

    :param FTP_URL: host name of the FTP server
    :returns: the logged-in ftplib.FTP object; the caller is responsible
        for closing it
    :raises: whatever FTP() or login() raise; if login() fails the
        connection is closed before the exception propagates
    """
    print("Initializing FTP object:")
    ftp = FTP(FTP_URL)
    print("Connecting to: {0}".format(FTP_URL))
    try:
        ftp.login()
    except Exception:
        # The constructor already opened the connection; do not leak it
        # when the login is rejected.
        ftp.close()
        raise
    print("Connected")
    return ftp
#MAIN CODE
#=======================================================================================================================
# Connect, walk the whole tree below BASE_FTP_PATH, and always release the
# connection - even when the traversal raises part-way through.
#   file_list  - full paths of every regular file found
#   final_dict - nested dictionary describing the directory tree
ftp = get_ftp_connection(FTP_URL)
file_list = []
try:
    final_dict = get_files_in_directory(ftp,file_list,RE_COMPILED_PATTERN,BASE_FTP_PATH)
finally:
    ftp.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment