Last active
June 21, 2023 06:25
-
-
Save perXautomatik/f81d928ccbcaa69f9fa68e9686f0f97d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--show the top 10 pairs of folders that have the most files in common. | |
-- Create a CTE named folders that selects the group ID, folder and filename from dupes table | |
-- and adds a column named full_path that combines the folder and filename with a backslash | |
with folders as ( | |
select [Group ID] as group_id, Folder || '\' || Filename as full_path from dupes | |
), | |
-- Create a CTE named split that recursively splits the full_path column into single characters | |
-- and appends them to a column named path_prefix, while keeping track of the last two characters | |
-- in columns last_char and common_denominator | |
split AS ( | |
select group_id, '' as path_prefix,'' as last_char, full_path as common_denominator from folders | |
UNION ALL | |
SELECT group_id, | |
path_prefix || substr(common_denominator, 1, 1) as path_prefix, | |
substr(common_denominator, 1, 1) as last_char, | |
substr(common_denominator, 2) as common_denominator FROM split | |
WHERE common_denominator <> '' | |
), | |
-- Create a CTE named folder_names that filters the rows from split. | |
-- This extracts the folder names from the original paths. | |
folder_names as ( | |
SELECT group_id, path_prefix, last_char, common_denominator FROM split | |
WHERE path_prefix != last_char and last_char = '\' and (common_denominator = '' OR last_char = '\') | |
) | |
-- Create a CTE named folder_pairs that performs a cross join between folder_names as left_folder | |
-- and folder_names as right_folder, where the common_denominator columns are equal and the path_prefix | |
-- columns are not. | |
-- This finds pairs of folders that have the same files inside them. | |
-- It also groups by the right_folder.group_id column and orders by the longest denominator column. | |
,folder_pairs as ( | |
select left_folder.path_prefix as left_folder_name, | |
right_folder.path_prefix as right_folder_name, | |
right_folder.group_id as right_group_id | |
from folder_names as left_folder cross join folder_names as right_folder | |
where left_folder.common_denominator = right_folder.common_denominator | |
and left_folder.path_prefix != right_folder.path_prefix | |
group by right_group_id | |
order by length(left_folder.common_denominator) | |
) | |
-- Finally, select the columns left_folder_name, right_folder_name and count from folder_pairs, | |
-- group by them, and order by count in descending order. It also limits the result to 10 rows. | |
-- This shows the top 10 pairs of folders that have the most files in common. | |
select left_folder_name,right_folder_name,count(*) as common_files from folder_pairs | |
group by left_folder_name,right_folder_name | |
order by common_files desc limit 10 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment