# Source: GitHub Gist by @Steven24K
# (gist id Steven24K/931480f4f2d6f81a0fae9c49337ac6c9), created December 8, 2020 17:11.
# NOTE: the GitHub page chrome captured with this file ("Skip to content",
# "Instantly share code, notes, and snippets.", "Show Gist options",
# "Save ... to your computer and use it in GitHub Desktop.") is not part of the script.
import csv
import os
from os import listdir
from os.path import isfile, join
from glob import glob
from pathlib import Path
import shutil
import datetime
import functools
# Small record describing one file found on disk: its name, size and path.
class MyImage:
    """One file discovered under the input directory.

    Attributes:
        FileName: Name of the file, as passed by the caller.
        fileSize: Size of the file, as passed by the caller.
        AbsoultePath: Path the file can be read from. The attribute keeps its
            original (misspelled) name because the rest of the script reads it.
    """

    def __init__(self, filename, fileSize, path):
        self.FileName = filename
        self.fileSize = fileSize
        self.AbsoultePath = path

    def __repr__(self):
        # Without this, printing a list of MyImage objects only shows
        # "<MyImage object at 0x...>", which is useless for debugging.
        return 'MyImage(filename={!r}, fileSize={!r}, path={!r})'.format(
            self.FileName, self.fileSize, self.AbsoultePath)
# --- Configuration ----------------------------------------------------------
INPUT_DIRECTORY = './input'    # walked recursively for candidate files
OUTPUT_DIRECTORY = './output'  # root directory the matched files are copied under
CSV_file = 'file_managed.csv'  # CSV listing the files that are expected to exist
use_file_size = True           # when True a match also requires an equal file size

# Column positions read from every CSV row.
# NOTE(review): the layout is assumed to be fid, uuid, langcode, uid, filename,
# uri, filemime, filesize, ... - confirm against the actual export.
FILENAME_COLUMN = 4
URI_COLUMN = 5
FILESIZE_COLUMN = 7

SUMMARY_TEMPLATE = (
    '\n'
    'SUMMARY\n'
    'Found files {}\n'
    'Not found files {}\n'
    'Total files {}\n'
)

# Results, filled in by main():
#   matched_files   -> list of (csv_row, image) tuples
#   files_not_found -> list of plain csv rows
matched_files = []
files_not_found = []


def scan_input_files(directory):
    """Return a MyImage (defined earlier in this file) for every file below *directory*.

    The directory is walked recursively; each entry records the bare file
    name, the size reported by ``stat`` and the path joined from the walk.
    """
    found = []
    for root, _dir_names, file_names in os.walk(directory):
        for name in file_names:
            real_path = os.path.join(root, name)
            found.append(MyImage(name, Path(real_path).stat().st_size, real_path))
    return found


def match_rows(rows, images, check_size=True):
    """Split CSV *rows* into those that have a file in *images* and those that do not.

    Args:
        rows: Iterable of CSV rows (lists of strings).
        images: Objects exposing ``FileName`` and ``fileSize``.
        check_size: When True, a candidate must also have a ``fileSize`` equal
            to the integer in the row's file-size column.

    Returns:
        ``(matched, not_found)`` where ``matched`` is a list of
        ``(row, image)`` tuples and ``not_found`` is a list of rows.

    Blank rows are ignored; rows with too few columns are reported and
    skipped. A row whose size column is not an integer can never match while
    *check_size* is on, so it is reported as not found.
    """
    required_columns = (FILESIZE_COLUMN if check_size else URI_COLUMN) + 1

    # Index the images by name once instead of rescanning the list per row.
    # Several files may share a name (different sub-directories), so keep all.
    images_by_name = {}
    for image in images:
        images_by_name.setdefault(image.FileName, []).append(image)

    matched = []
    not_found = []
    for row in rows:
        if len(row) < required_columns:
            if row:  # csv.reader yields [] for blank lines; those are just noise
                print('Skipping malformed CSV row: {}'.format(row))
            continue

        candidates = images_by_name.get(row[FILENAME_COLUMN], [])
        if check_size:
            try:
                expected_size = int(row[FILESIZE_COLUMN])
            except ValueError:
                candidates = []
            else:
                # Compare against the files carrying this name only, not
                # against the sizes of every file in the input directory.
                candidates = [c for c in candidates if c.fileSize == expected_size]

        if candidates:
            matched.append((row, candidates[0]))
        else:
            not_found.append(row)
    return matched, not_found


def output_location(row, output_directory):
    """Return ``(directory, destination)`` for the file described by *row*.

    The directory part of the row's URI (without the ``public://`` scheme) is
    recreated below *output_directory*; the file keeps the name from the
    filename column.
    """
    relative = row[URI_COLUMN].replace('public://', '')
    # Drop only the last path component. Replacing the filename anywhere in
    # the URI would also damage directories that contain the file name.
    # Leading slashes are stripped so the join cannot escape to an absolute path.
    relative_dir = relative.rpartition('/')[0].lstrip('/')
    directory = os.path.join(output_directory, relative_dir)
    return directory, os.path.join(directory, row[FILENAME_COLUMN])


def copy_matched_files(matches, output_directory):
    """Copy every matched file below *output_directory*, never overwriting."""
    for row, image in matches:
        directory, destination = output_location(row, output_directory)
        try:
            os.makedirs(directory)
        except FileExistsError:
            # Only "already there" is expected; permission problems and the
            # like must surface instead of being reported as "exists".
            print("DIRECTORY {} allready exists".format(directory))

        if os.path.exists(destination):
            print(destination + ' does allready exists')
        else:
            shutil.copy(image.AbsoultePath, destination)


def write_unmatched_log(rows, log_directory='logs'):
    """Write the URIs of the unmatched *rows* to a timestamped log file.

    Returns the path of the log file. A failing write is reported on stdout
    and does not abort the run.
    """
    os.makedirs(log_directory, exist_ok=True)
    stamp = datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S')
    log_path = os.path.join(log_directory, 'log_{}_unmatched_files.txt'.format(stamp))
    # The context manager guarantees the log is flushed and closed.
    with open(log_path, 'w') as log:
        try:
            log.write(str([row[URI_COLUMN] for row in rows]))
        except (OSError, UnicodeError) as e:
            print('Error: {}'.format(e))
    return log_path


def main():
    """Scan the input directory, match it against the CSV, copy, log and summarise."""
    print('Started reading files from {} :: input directory'.format(INPUT_DIRECTORY))
    input_files = scan_input_files(INPUT_DIRECTORY)
    print(input_files)
    print('INPUT FILES IN ' + INPUT_DIRECTORY + ' :' + str(len(input_files)))

    # Match while the CSV is open so rows are streamed, then close it.
    with open(CSV_file, newline='') as csv_handle:
        reader = csv.reader(csv_handle, delimiter=',', quotechar='"')
        found, missing = match_rows(reader, input_files, use_file_size)
    matched_files.extend(found)
    files_not_found.extend(missing)

    copy_matched_files(matched_files, OUTPUT_DIRECTORY)
    write_unmatched_log(files_not_found)

    print(SUMMARY_TEMPLATE.format(
        len(matched_files),
        len(files_not_found),
        len(matched_files) + len(files_not_found),
    ))
    print('Done.')


if __name__ == '__main__':
    main()
# (GitHub page footer captured with this file, not part of the script:
# "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")