Created
June 25, 2013 06:15
-
-
Save JimHokanson/5856338 to your computer and use it in GitHub Desktop.
Rough version of the FTP calls for retrieving tracker data. Still to do: add code for saving the results, and check whether the change-directory call needs to change.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
try: | |
import cPickle as pickle | |
except: | |
import pickle | |
from ftplib import FTP | |
import re | |
import time #For modification time parsing | |
#For later | |
#---------------------------------------------- | |
#import os | |
#import h5py | |
""" | |
CODE DESIGN OUTLINES | |
===================================================================== | |
1) Provide an interface for retrieving the data. Eventually this could become
a class that adds caching, with subclasses implementing the specific ways of
retrieving the data.
""" | |
CACHED_FTP_RESULTS_PATH = r"F:\worm_data\ftp_cache.p"

# Example LIST lines the regular expression below is written for:
#   'drwxr-xr-x 3 24439 24439 4096 Nov 12 2012 C11D2.2'
#   'drwxr-xr-x 3 24439 24439 4096 Nov 12 2012 ok1565'
#   '-rw-r--r-- 1 24439 24439 25180977 Nov 12 2012 CIID2.2 (ok1565)IV on food R_2011_08_04__12_26_51___1___6_features.mat'
#
# Field order:
#   attr  nlinks  owner  group  size  month  day  year  filename
#
# Only attr, month, day, year and name are named groups; the rest are
# captured positionally and currently unused.
#
# The pattern is written as raw strings: '\S', '\s' and '\d' are not valid
# string escapes, so a plain literal only works by accident and triggers
# invalid-escape warnings on current Python versions. The compiled pattern
# is character-for-character the same as before.
RE_COMPILED_PATTERN = re.compile(
    r'(?P<attr>\S*)\s*(\d+)\s*([^ ]*)\s*([^ ]*)\s*(\d+)\s*'
    r'(?P<month>\S*)\s*(?P<day>\d+)\s*(?P<year>\d+)\s*(?P<name>.*)'
)

BASE_FTP_PATH = "/pub/tjucikas/wormdatabase/results-12-06-08/Laura Grundy"
FTP_URL = "ftp.mrc-lmb.cam.ac.uk"
#======================================================================================== | |
def get_parsed_listing(RE_COMPILED_PATTERN, list_str_element):
    """
    Parse one line of an FTP ``LIST`` response into a dictionary.

    dict_listing = get_parsed_listing(re_compiled_pattern, list_element)

    :param RE_COMPILED_PATTERN: compiled regular expression that exposes the
        named groups ``attr``, ``month``, ``day``, ``year`` and ``name``
    :param list_str_element: a single listing line, roughly of the form
        'drwxr-xr-x 3 24439 24439 4096 Nov 12 2012 C11D2.2'
    :return: dict_listing with the keys
        type              - single character file type (first character of
                            the unix attribute string):
                                d : directory
                                l : link
                                - : file
                                s : socket file
        name              - file or folder name
        last_modified_num - numeric for last modified time
                            (result of time.mktime, so local-time based)
        last_modified_str - last modified time as string, "<month> <day> <year>"
    :raises ValueError: if the line does not match the pattern, if the
        attribute field is empty, or (from time.strptime) if the date fields
        do not fit "%b %d %Y" - e.g. when the third date field is not a
        four-digit year.

    IMPROVEMENTS:
        some properties from the listing are currently unparsed
    """
    # NOTE: FTP servers can be fairly loose about how they format a listing.
    # The regular expression is hardcoded for the pattern shown above.
    m = RE_COMPILED_PATTERN.search(list_str_element)
    if m is None:
        raise ValueError("Regular expression error parsing this string:\n{0}".format(list_str_element))

    d = m.groupdict()  # dictionary containing the named tokens

    # The attribute string (e.g. 'drwxr-xr-x' or '-rw-r--r--') is otherwise
    # left unparsed; only its first character is used. The pattern allows an
    # empty match here, so fail with a clear message instead of an IndexError.
    attr = d["attr"]
    if not attr:
        raise ValueError("Listing has no attribute field:\n{0}".format(list_str_element))

    # NOTE: there might be a better way of doing this; the regexp could be
    # changed to capture the whole date in one group.
    last_modified_str = ' '.join((d["month"], d["day"], d["year"]))
    last_modified_time = time.strptime(last_modified_str, "%b %d %Y")

    return {
        'type': attr[0],
        'name': d["name"],
        'last_modified_str': last_modified_str,
        'last_modified_num': time.mktime(last_modified_time),
    }
#========================================================================================= | |
def get_files_in_directory(ftp, file_list, RE_COMPILED_PATTERN, base_path):
    """
    Recursively list ``base_path`` on the FTP server.

    :param ftp: FTP connection object; its working directory is changed to
        ``base_path`` (and, through recursion, to every sub-directory)
    :param file_list: list that is extended in place with the full path
        (``base_path + '/' + name``) of every regular file found
    :param RE_COMPILED_PATTERN: compiled regular expression handed to
        get_parsed_listing for parsing each listing line
    :param base_path: directory to list
    :return: dict mapping each entry name to its parsed listing dict;
        directory entries additionally carry a 'child' key holding the
        result of the recursive call for that directory
    """
    ftp.cwd(base_path)

    dict_output = {}

    # The complete listing is collected before any recursion happens, so the
    # directory changes made by recursive calls cannot affect this listing.
    cur_dir_contents_raw = []
    ftp.retrlines('LIST', cur_dir_contents_raw.append)

    for list_element in cur_dir_contents_raw:
        # Some servers start the listing with a 'total <n>' summary line (or
        # emit blank lines); these are not entries and would fail to parse.
        if not list_element.strip() or list_element.startswith('total '):
            continue

        # Parse the string into a meaningful dictionary
        d = get_parsed_listing(RE_COMPILED_PATTERN, list_element)

        cur_listing_type = d["type"]
        cur_name = d["name"]

        # Servers that include the self/parent entries would otherwise send
        # the recursion below into an endless loop.
        if cur_name in ('.', '..'):
            continue

        entry_path = base_path + '/' + cur_name

        # DEBUG LINE
        print('Dir: ' + entry_path)

        if cur_listing_type == 'd':
            # Recursive call - change to directory and get contents
            d['child'] = get_files_in_directory(ftp, file_list, RE_COMPILED_PATTERN, entry_path)
        elif cur_listing_type == '-':
            # append to file list
            file_list.append(entry_path)
        else:
            print('FTP warning: unhandled type: "{0}" with name: "{1}" at: "{2}"'.format(cur_listing_type, cur_name, base_path))

        dict_output[cur_name] = d

    return dict_output
#======================================================================================================================= | |
def get_ftp_connection(FTP_URL):
    """
    Open an FTP connection to ``FTP_URL`` and log in without credentials.

    :param FTP_URL: host name of the FTP server
    :return: the connected and logged-in ftplib.FTP object; the caller is
        responsible for closing it
    :raises: whatever ftplib raises when connecting or logging in fails; if
        the login fails the connection is closed before the error propagates
    """
    print("Initializing FTP object:")
    # Create the object without a host so that the connection is made
    # explicitly below; FTP(host) would already connect in the constructor,
    # i.e. before the "Connecting to" message is printed.
    ftp = FTP()

    print("Connecting to: {0}".format(FTP_URL))
    ftp.connect(FTP_URL)
    try:
        ftp.login()
    except Exception:
        # Do not leak the open socket when the login is rejected.
        ftp.close()
        raise

    print("Connected")
    return ftp
#MAIN CODE
#=======================================================================================================================
# Crawl the server starting at BASE_FTP_PATH.
#   file_list  - flat list of the full paths of all files found
#   final_dict - nested dictionary mirroring the directory tree
ftp = get_ftp_connection(FTP_URL)
file_list = []
try:
    final_dict = get_files_in_directory(ftp, file_list, RE_COMPILED_PATTERN, BASE_FTP_PATH)
finally:
    # Close the connection even when the crawl fails part-way through.
    ftp.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment