Skip to content

Instantly share code, notes, and snippets.

@kism
Created July 29, 2024 01:14
Show Gist options
  • Save kism/cf018aa87e429fe4796201502ca6e8af to your computer and use it in GitHub Desktop.
Save kism/cf018aa87e429fe4796201502ca6e8af to your computer and use it in GitHub Desktop.
Replicate/replace file modified times
#!/usr/bin/env python3
"""
./date_replicate.py my_folder/original my_folder/copied
Finds files with the same size and hash (first 10MB) but different modified times.
Updates the matching files in the second folder with the modified time of the first folder's files.
"""
import os
import sys
import datetime
import hashlib
import json
def get_files_with_size_and_mtime(directory):
    """Index every file found under *directory* by its size in bytes.

    Walks the tree with ``os.walk`` and returns a dict mapping each file size
    to a list of ``(path, mtime)`` tuples for the files of that size. A file
    whose size or modification time cannot be read is reported on stdout and
    left out of the result.
    """
    by_size = {}
    for parent, _, names in os.walk(directory):
        for name in names:
            file_path = os.path.join(parent, name)
            try:
                size = os.path.getsize(file_path)
                mtime = os.path.getmtime(file_path)
            except OSError as e:
                print(f"Error accessing file {file_path}: {e}")
                continue
            by_size.setdefault(size, []).append((file_path, mtime))
    return by_size
def get_file_checksum(file_path, max_bytes=10 * 1024 * 1024, chunk_size=64 * 1024):
    """Return the SHA-256 hex digest of the leading bytes of a file.

    Only the first *max_bytes* bytes are hashed (10MB by default), so the
    digest is a quick fingerprint rather than a full-file checksum. Files
    shorter than the limit are hashed completely.

    Args:
        file_path: Path of the file to read.
        max_bytes: Maximum number of leading bytes to hash. ``None`` hashes
            the whole file.
        chunk_size: Number of bytes requested per read; must be positive.

    Returns:
        The hexadecimal SHA-256 digest as a string.

    Raises:
        ValueError: If *chunk_size* is not positive.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size <= 0:
        # A non-positive size would make read() return nothing or everything.
        raise ValueError("chunk_size must be a positive number of bytes")

    hash_algo = hashlib.sha256()
    remaining = max_bytes
    with open(file_path, "rb") as f:
        # Reading stops at end-of-file on its own, so there is no need to
        # seek to the end first just to learn the file size.
        while remaining is None or remaining > 0:
            wanted = chunk_size if remaining is None else min(chunk_size, remaining)
            chunk = f.read(wanted)
            if not chunk:
                break
            hash_algo.update(chunk)
            if remaining is not None:
                remaining -= len(chunk)
    return hash_algo.hexdigest()
def compare_checksums(file_path1, file_path2):
    """Compare the SHA-256 checksums of two files.

    Both checksums are obtained from ``get_file_checksum``.

    Args:
        file_path1: Path of the first file.
        file_path2: Path of the second file.

    Returns:
        A ``(matched, message)`` tuple. ``matched`` is True only when both
        checksums could be computed and are equal; ``message`` describes the
        outcome.
    """
    try:
        checksum1 = get_file_checksum(file_path1)
        checksum2 = get_file_checksum(file_path2)
    except FileNotFoundError:
        # A file that vanished between the directory scan and the hash is
        # reported as a mismatch instead of aborting the whole run.
        return False, "One or both files do not exist."
    except OSError as e:
        return False, f"Could not read one or both files: {e}"

    # Kept for checksum helpers that signal a missing file by returning None.
    if checksum1 is None or checksum2 is None:
        return False, "One or both files do not exist."
    if checksum1 == checksum2:
        return True, "The checksums match."
    return False, "The checksums do not match."
def compare_files_in_directories(dir1, dir2):
    """Find files in *dir2* that are newer copies of same-sized files in *dir1*.

    Every file in *dir1* is paired with every file of the same size in *dir2*.
    For each pair whose modification times differ, the base name(s) are
    printed and the pair is compared with ``compare_checksums``. When the
    checksums match and the *dir2* file is newer, a record is appended to the
    module-level ``out_list``::

        {"path": <dir2 file>, "time_new": <dir1 mtime>, "time_original": <dir2 mtime>}

    ``out_list`` must already exist as a module-level list when this function
    is called. Pressing Ctrl-C stops the scan early; records collected up to
    that point stay in ``out_list``.

    Args:
        dir1: Directory holding the files with the wanted modification times.
        dir2: Directory holding the files whose times may need replacing.
    """
    dir1_files = get_files_with_size_and_mtime(dir1)
    dir2_files = get_files_with_size_and_mtime(dir2)
    print("Files with same size but different modified time:")
    try:
        for size, files in dir1_files.items():
            candidates = dir2_files.get(size)
            if not candidates:
                continue
            for file1, mtime1 in files:
                for file2, mtime2 in candidates:
                    if mtime1 == mtime2:
                        continue
                    print("")
                    name1 = os.path.basename(file1)
                    name2 = os.path.basename(file2)
                    print(f"{name1}")
                    if name1 != name2:
                        print(f"{name2}")
                    result, _message = compare_checksums(file1, file2)
                    if not result:
                        continue
                    # Subtract the raw epoch seconds: a difference of naive
                    # local datetimes is off by the DST offset whenever the two
                    # times fall on opposite sides of a clock change.
                    time_diff = mtime2 - mtime1
                    if time_diff > 0:
                        print(
                            f"Checksums match, time diff: {time_diff} appending to list."
                        )
                        out_list.append(
                            {
                                "path": file2,
                                "time_new": mtime1,
                                "time_original": mtime2,
                            }
                        )
    except KeyboardInterrupt:
        # Deliberate early exit: keep what was found, but say so.
        print("\nInterrupted, keeping the matches found so far.")
def get_next_filename(base_filename):
    """Return a path that does not exist yet, derived from *base_filename*.

    The base name is returned unchanged when it is free. Otherwise ``_1``,
    ``_2``, ... is inserted before the extension until an unused name is found.
    """
    stem, suffix = os.path.splitext(base_filename)
    candidate = base_filename
    counter = 0
    while os.path.exists(candidate):
        counter += 1
        candidate = f"{stem}_{counter}{suffix}"
    return candidate
# compare_files_in_directories() appends its matches to this module-level
# list, so it has to exist before that function is called.
out_list = []

if __name__ == "__main__":
    if len(sys.argv) < 3:
        # Explain the expected arguments instead of failing with IndexError.
        sys.exit(f"Usage: {sys.argv[0]} <original_dir> <copied_dir>")
    dir1 = sys.argv[1]
    dir2 = sys.argv[2]

    compare_files_in_directories(dir1, dir2)

    # Record the planned changes (including the times being replaced) before
    # any file is touched, so they can be reverted by hand.
    filename = get_next_filename("backup_date_replicate.json")
    with open(filename, "w") as f:
        json.dump(out_list, f)

    for entry in out_list:
        target_time = datetime.datetime.fromtimestamp(entry["time_new"])
        print(f'{entry["path"]}. New time: {target_time}')
        try:
            # Access and modification time are both set to the replicated time.
            os.utime(entry["path"], (entry["time_new"], entry["time_new"]))
        except OSError as e:
            # One file that cannot be updated should not stop the rest.
            print(f"Error updating {entry['path']}: {e}")
#!/usr/bin/env python3
"""
./finddate.py my_folder
Searches for files modified between START_DATE and END_DATE and sets their modification time to TARGET_DATE_STR.
See the constants for the start and end dates to search for.
"""
import sys
import os
import random
import json
from datetime import datetime, timedelta
# Inclusive start of the modification-time window, as "YYYY-MM-DD". It is
# parsed with "%Y-%m-%d", i.e. to 00:00:00 on that day.
START_DATE = "2024-01-01"
# Inclusive end of the window, same format. It is also parsed to 00:00:00, so
# files modified later during that day fall outside the window.
END_DATE = "2024-01-02"
# Date ("YYYY-MM-DD") that matching files get as their new modification time;
# the script adds a random offset of up to one second to it per file.
TARGET_DATE_STR = "2021-10-22"
def get_next_filename(base_filename):
    """Pick an output path that will not overwrite an existing file.

    Returns *base_filename* itself when nothing exists at that path. Otherwise
    returns the first of ``name_1.ext``, ``name_2.ext``, ... that is unused.
    """
    if os.path.exists(base_filename):
        root, ext = os.path.splitext(base_filename)
        n = 1
        while os.path.exists(f"{root}_{n}{ext}"):
            n += 1
        return f"{root}_{n}{ext}"
    return base_filename
def find_files_within_dates(directory, START_DATE, END_DATE):
    """Collect the files under *directory* modified inside a date window.

    Args:
        directory: Root of the tree to walk.
        START_DATE: Inclusive start of the window, as a "YYYY-MM-DD" string.
        END_DATE: Inclusive end of the window, as a "YYYY-MM-DD" string. It is
            parsed to 00:00:00 on that day, so later times that day are
            outside the window.

    Returns:
        A list of ``{"file": path, "mod_time": mtime}`` dicts, where
        ``mod_time`` is the modification time in seconds since the epoch.
        Files whose modification time cannot be read are reported on stdout
        and skipped.

    Raises:
        ValueError: If either date string does not match "%Y-%m-%d".
    """
    # Parse into new names rather than rebinding the string parameters.
    window_start = datetime.strptime(START_DATE, "%Y-%m-%d")
    window_end = datetime.strptime(END_DATE, "%Y-%m-%d")
    matching_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                mtime = os.path.getmtime(file_path)
            except OSError as e:
                # Dangling symlinks or files removed mid-walk must not abort
                # the whole search.
                print(f"Error accessing file {file_path}: {e}")
                continue
            if window_start <= datetime.fromtimestamp(mtime) <= window_end:
                # Keep the timestamp exactly as reported, not a value that has
                # been round-tripped through a datetime.
                matching_files.append({"file": file_path, "mod_time": mtime})
    return matching_files
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Explain the expected argument instead of failing with IndexError.
        sys.exit(f"Usage: {sys.argv[0]} <directory>")
    directory_to_search = sys.argv[1]
    target_date = datetime.strptime(TARGET_DATE_STR, "%Y-%m-%d")

    files = find_files_within_dates(directory_to_search, START_DATE, END_DATE)

    # Save the original modification times before any file is touched, so the
    # change can be reverted by hand.
    filename = get_next_filename("backup_find_date.json")
    with open(filename, "w") as f:
        json.dump(files, f)

    for file in files:
        original_time = datetime.fromtimestamp(file["mod_time"])
        # Spread the new times over up to one second past the target date so
        # the files do not all share one identical timestamp.
        new_time = target_date + timedelta(microseconds=(random.randint(0, 1000000)))
        new_time_epoch = new_time.timestamp()
        print()
        print(
            f"{file['file']} was modified on {original_time.strftime('%Y-%m-%d %H:%M:%S.%f')}"
        )
        print(f"new time: {new_time.strftime('%Y-%m-%d %H:%M:%S.%f')}")
        try:
            # Access and modification time are both set to the new time.
            os.utime(file["file"], times=(new_time_epoch, new_time_epoch))
        except OSError as e:
            # One file that cannot be updated should not stop the rest.
            print(f"Error updating {file['file']}: {e}")
#!/usr/bin/env bash
# Generate test dirs and files for the two scripts.
#
# Creates my_folder/original and my_folder/copied, each holding file1.bin to
# file10.bin (1MB to 10MB of random data, identical between the two folders),
# all with a modification time of 2024-01-01 00:00. The first three originals
# are then backdated to 2020-01-01 00:00 so their copies are newer.

# -p: do not fail when the directories are left over from an earlier run.
mkdir -p my_folder/original my_folder/copied

# -f: stay quiet when the directories are still empty and the glob matches nothing.
rm -f -- my_folder/original/* my_folder/copied/*

for i in {1..10}
do
    dd if=/dev/urandom of="my_folder/original/file$i.bin" bs=1M count="$i"
    touch -m -t 202401010000 "my_folder/original/file$i.bin"
    cp "my_folder/original/file$i.bin" "my_folder/copied/file$i.bin"
    touch -m -t 202401010000 "my_folder/copied/file$i.bin"
done

for i in 1 2 3
do
    touch -m -t 202001010000 "my_folder/original/file$i.bin"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment