Skip to content

Instantly share code, notes, and snippets.

Created August 25, 2023 10:50
Show Gist options
  • Save yosignals/8ce4c2d1a8a39f08d65db84b080cdc65 to your computer and use it in GitHub Desktop.
Save yosignals/8ce4c2d1a8a39f08d65db84b080cdc65 to your computer and use it in GitHub Desktop.
Twitter dump duplicate joiner
import os
import re
# Directory path where the txt files are stored
folder_path = '.'

# Adjusted regular expressions to match the given line format
email_pattern = re.compile(r'Email: ([\w\.-]+@[\w\.-]+)')
screen_name_pattern = re.compile(r'ScreenName: (\S+)')


def collect_screen_names(directory):
    """Map each email found in the ``.txt`` files of *directory* to its screen names.

    Only lines that contain both an ``Email: ...`` and a ``ScreenName: ...``
    field are considered; the first occurrence of each field on the line is
    used (one email and one screen name per line is assumed).

    Args:
        directory: Path of the folder whose ``.txt`` files are scanned
            (non-recursive).

    Returns:
        A dict of the form ``{email: {'screen_names': set_of_screen_names}}``.
    """
    found = {}
    # Sorted so that the order of emails in the result does not depend on the
    # arbitrary order returned by os.listdir().
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if not filename.endswith('.txt') or not os.path.isfile(path):
            continue
        # errors='replace' keeps a single undecodable byte in a dump from
        # aborting the whole run.
        with open(path, 'r', errors='replace') as file:
            for line in file:
                email_match = email_pattern.search(line)
                screen_name_match = screen_name_pattern.search(line)
                if not (email_match and screen_name_match):
                    continue
                # setdefault creates the entry only the first time an email is
                # seen; later lines add to the existing set instead of
                # replacing it (the original overwrote it on every line).
                entry = found.setdefault(email_match.group(1),
                                         {'screen_names': set()})
                entry['screen_names'].add(screen_name_match.group(1))
    return found


def write_summary(emails, output_path):
    """Write every email that has more than one unique screen name to *output_path*.

    Args:
        emails: Mapping as returned by :func:`collect_screen_names`.
        output_path: File to (over)write with the summary.
    """
    with open(output_path, 'w') as outfile:
        found_duplicates = False
        for email, data in emails.items():
            # Checking if there are multiple unique screen names for the email
            if len(data['screen_names']) > 1:
                found_duplicates = True
                outfile.write(f"Email: {email} has accounts with the following unique screen names:\n")
                # Sorted so the report is identical from run to run.
                for screen_name in sorted(data['screen_names']):
                    outfile.write(f"- {screen_name}\n")
        if not found_duplicates:
            outfile.write("No email addresses with multiple unique screen names found.\n")


# Dictionary to store email addresses and their associated unique screen names
email_dict = collect_screen_names(folder_path)

# Write the summary to output.txt
write_summary(email_dict, 'output.txt')
print("Summary written to output.txt")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment