Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save perXautomatik/f81d928ccbcaa69f9fa68e9686f0f97d to your computer and use it in GitHub Desktop.
Save perXautomatik/f81d928ccbcaa69f9fa68e9686f0f97d to your computer and use it in GitHub Desktop.
--show the top 10 pairs of folders that have the most files in common.
-- Create a CTE named folders that selects the group ID, folder and filename from dupes table
-- and adds a column named full_path that combines the folder and filename with a backslash
with folders as (
select [Group ID] as group_id, Folder || '\' || Filename as full_path from dupes
),
-- Create a CTE named split that recursively splits the full_path column into single characters
-- and appends them to a column named path_prefix, while keeping track of the last two characters
-- in columns last_char and common_denominator
split AS (
select group_id, '' as path_prefix,'' as last_char, full_path as common_denominator from folders
UNION ALL
SELECT group_id,
path_prefix || substr(common_denominator, 1, 1) as path_prefix,
substr(common_denominator, 1, 1) as last_char,
substr(common_denominator, 2) as common_denominator FROM split
WHERE common_denominator <> ''
),
-- Create a CTE named folder_names that filters the rows from split.
-- This extracts the folder names from the original paths.
folder_names as (
SELECT group_id, path_prefix, last_char, common_denominator FROM split
WHERE path_prefix != last_char and last_char = '\' and (common_denominator = '' OR last_char = '\')
)
-- Create a CTE named folder_pairs that performs a cross join between folder_names as left_folder
-- and folder_names as right_folder, where the common_denominator columns are equal and the path_prefix
-- columns are not.
-- This finds pairs of folders that have the same files inside them.
-- It also groups by the right_folder.group_id column and orders by the longest denominator column.
,folder_pairs as (
select left_folder.path_prefix as left_folder_name,
right_folder.path_prefix as right_folder_name,
right_folder.group_id as right_group_id
from folder_names as left_folder cross join folder_names as right_folder
where left_folder.common_denominator = right_folder.common_denominator
and left_folder.path_prefix != right_folder.path_prefix
group by right_group_id
order by length(left_folder.common_denominator)
)
-- Finally, select the columns left_folder_name, right_folder_name and count from folder_pairs,
-- group by them, and order by count in descending order. It also limits the result to 10 rows.
-- This shows the top 10 pairs of folders that have the most files in common.
select left_folder_name,right_folder_name,count(*) as common_files from folder_pairs
group by left_folder_name,right_folder_name
order by common_files desc limit 10
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment