Skip to content

Instantly share code, notes, and snippets.

@Shubhang
Last active April 11, 2023 16:13
Show Gist options
  • Save Shubhang/c3172f24513eb36772d70763219f35a8 to your computer and use it in GitHub Desktop.
Group by Similarity
import os
import shutil
from difflib import SequenceMatcher
def similarity(a, b):
    """Return how alike two sequences are, as a ratio between 0.0 and 1.0.

    The value is ``difflib.SequenceMatcher(None, a, b).ratio()``: 1.0 means
    the sequences are identical and 0.0 means they have nothing in common.
    No "junk" filter is applied (the first argument is ``None``).
    """
    return SequenceMatcher(None, a, b).ratio()
def _unique_destination(path):
    """Return *path*, or a numbered variant of it that does not exist yet."""
    if not os.path.exists(path):
        return path
    stem, ext = os.path.splitext(path)
    counter = 1
    candidate = "{}_{}{}".format(stem, counter, ext)
    while os.path.exists(candidate):
        counter += 1
        candidate = "{}_{}{}".format(stem, counter, ext)
    return candidate


def main(base_dir='.', similarity_threshold=0.6, prefix_length=10):
    """Group files by file-name similarity and move them into group folders.

    Every file found by walking *base_dir* recursively is compared, by the
    first *prefix_length* characters of its name, against the prefix that
    represents each existing group. The file joins the first group whose
    prefix scores at least *similarity_threshold*; otherwise it starts a new
    group keyed by its own prefix. Each group is then moved into a folder
    named ``<prefix>_group`` created directly under *base_dir*.

    Args:
        base_dir: Directory to search; group folders are created inside it.
        similarity_threshold: Minimum ``similarity`` ratio (0.0-1.0) needed
            for a file to join an existing group.
        prefix_length: Number of leading file-name characters compared.

    NOTE: every file the walk finds is a candidate, including files that
    already sit inside a ``*_group`` folder and this script itself if it
    lives under *base_dir*.
    """
    # Maps each group's representative prefix to the FULL paths of its
    # members. Storing the full path (not just the name) matters: the files
    # are moved only after the walk has finished, when the loop's `root`
    # would point at the last directory visited rather than the file's own.
    file_groups = {}
    for root, _, files in os.walk(base_dir):
        for file in files:
            file_prefix = file[:prefix_length]
            file_path = os.path.join(root, file)
            for group_prefix, group_paths in file_groups.items():
                if similarity(file_prefix, group_prefix) >= similarity_threshold:
                    group_paths.append(file_path)
                    break
            else:
                # No existing group was similar enough: start a new one.
                file_groups[file_prefix] = [file_path]

    # Move the files into their respective folders based on similarity
    for group_prefix, group_paths in file_groups.items():
        new_folder_path = os.path.join(base_dir, group_prefix + "_group")
        os.makedirs(new_folder_path, exist_ok=True)
        for file_path in group_paths:
            new_file_path = os.path.join(
                new_folder_path, os.path.basename(file_path)
            )
            # A file already in its group folder (e.g. on a re-run) stays put.
            if os.path.abspath(file_path) == os.path.abspath(new_file_path):
                continue
            # Same-named files from different directories must not silently
            # overwrite each other, so pick a free name when one is taken.
            shutil.move(file_path, _unique_destination(new_file_path))
# Run the grouping only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment