Created
March 11, 2022 17:07
-
-
Save SphinxKnight/60d766b8d6cd970b42eaf967b8dac3ff to your computer and use it in GitHub Desktop.
Check for duplicated files in mdn/translated-content
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Report non-document files in a translated locale that duplicate the reference locale.

Walks the reference tree (``ref_path``) and the translated tree
(``locale_path``), both relative to the current working directory, and
prints every translated file whose relative path and SHA-256 digest match
a file in the reference tree, followed by the total size in bytes of
those duplicated files.
"""
import hashlib
import os

ref_locale = 'en-us'
ref_path = 'content/files/' + ref_locale + '/'
locale = 'fr'
locale_path = 'translated-content/files/' + locale + '/'

# Files with these suffixes are page documents, not assets, and are ignored.
_DOCUMENT_SUFFIXES = ('.md', '.html')


def _sha256_of(path, chunk_size=65536):
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in chunks so large assets are never held in memory
    all at once, and the handle is always closed.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def _iter_assets(root):
    """Yield ``(slug, full_path)`` for every non-document file under *root*.

    The slug is the path relative to *root*. It is computed with
    ``os.path.relpath`` rather than by splitting the path on the locale
    name: a short locale such as ``fr`` also occurs inside ordinary
    directory names (``iframe``), which made the split cut the slug short.
    """
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            if filename.endswith(_DOCUMENT_SUFFIXES):
                continue
            full_path = os.path.join(dirpath, filename)
            yield os.path.relpath(full_path, root), full_path


def find_duplicates(reference_root, translated_root):
    """Yield paths under *translated_root* that duplicate a reference file.

    A translated file is a duplicate when a file with the same relative
    path exists under *reference_root* and both have the same SHA-256
    digest. Paths are yielded in ``os.walk`` order and are built from
    *translated_root* exactly as it was passed in.
    """
    reference_hashes = {
        slug: _sha256_of(path) for slug, path in _iter_assets(reference_root)
    }
    for slug, path in _iter_assets(translated_root):
        expected = reference_hashes.get(slug)
        # Only hash the translated file when a reference counterpart exists.
        if expected is not None and _sha256_of(path) == expected:
            yield path


def main():
    """Print each duplicated file, then the total number of bytes they occupy."""
    spared_size = 0
    for path in find_duplicates(ref_path, locale_path):
        spared_size += os.path.getsize(path)
        print(path)
    print(spared_size)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I was using this one as a "one time" thing so I'm not really into plugging new features in it. If you need me to create a repo for this, so that it can be forked/extended, I can do that.