Skip to content

Instantly share code, notes, and snippets.

@bigsnarfdude
Last active February 14, 2024 17:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bigsnarfdude/d310b370fada00cdf0eb15b25f602928 to your computer and use it in GitHub Desktop.
Save bigsnarfdude/d310b370fada00cdf0eb15b25f602928 to your computer and use it in GitHub Desktop.
clean_transcripts_function.py
import os
import sys
def clean_duplicates(file_name):
    """Return the text of *file_name* with repeated lines removed.

    The file is read line by line and only the first occurrence of each
    line is kept, in its original order.  Lines are compared without their
    trailing newline, so a final line that lacks a terminator is still
    recognised as a repeat of an earlier, newline-terminated line.

    Args:
        file_name: Path of the text file to read.

    Returns:
        The retained lines joined with a single space.  Each retained line
        keeps its own line ending, so every line after the first starts
        with that joining space.  An empty file yields an empty string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    seen = set()  # O(1) membership test instead of rescanning a list per line
    cleaned = []
    with open(file_name, 'r') as f:
        for line in f:
            # Compare on content only; the last line of a file may have no
            # newline and must still match its earlier occurrence.
            key = line.rstrip('\n')
            if key in seen:
                continue
            seen.add(key)
            cleaned.append(line)
    return ' '.join(cleaned)
def process_directory(directory):
    """Write a de-duplicated copy of every ``.txt`` file in *directory*.

    Each regular file whose name ends in ``.txt`` is passed to
    ``clean_duplicates`` and the result is written next to it as
    ``<name>_cleaned.txt``.  Only the immediate contents of *directory*
    are considered; sub-directories are not descended into.

    Files already ending in ``_cleaned.txt`` are skipped so that running
    the function again does not produce ``*_cleaned_cleaned.txt`` copies
    of earlier output.

    Args:
        directory: Path of the directory to scan.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be
            read or written.
    """
    suffix = '.txt'
    cleaned_suffix = '_cleaned' + suffix
    for file_name in os.listdir(directory):
        if not file_name.endswith(suffix):
            continue
        if file_name.endswith(cleaned_suffix):
            continue
        file_path = os.path.join(directory, file_name)
        # os.listdir also returns directories; opening one would raise.
        if not os.path.isfile(file_path):
            continue
        cleaned_text = clean_duplicates(file_path)
        # Swap only the trailing extension of the file name.  A plain
        # str.replace on the full path would also rewrite any '.txt'
        # appearing in a directory component or earlier in the name.
        stem = file_name[:-len(suffix)]
        new_file_name = os.path.join(directory, stem + cleaned_suffix)
        with open(new_file_name, 'w') as f:
            f.write(cleaned_text)
if __name__ == "__main__":
    # The first positional argument is the directory to process; any
    # further arguments are ignored.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python script_name.py <directory_path>")
        sys.exit(1)
    process_directory(cli_args[0])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment