Skip to content

Instantly share code, notes, and snippets.

@salty-horse
Created November 14, 2022 06:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save salty-horse/a30f478c90d5ea54e029bd2c5aa900b3 to your computer and use it in GitHub Desktop.
Save salty-horse/a30f478c90d5ea54e029bd2c5aa900b3 to your computer and use it in GitHub Desktop.
Extract Twitter display names from the Wayback Machine
#!/usr/bin/env python
# gem install --user-install wayback_machine_downloader
# wayback_machine_downloader -c 20 -s http://twitter.com/haszombiesinit
# find . -type f -iname '*html' | xargs grep '<title[^>]*>' | sort | ./extract_display_names.py
#
# Filter 'Tweets with replies by' manually
import fileinput
import html
import os
import re
RE_TITLE = re.compile('<title[^>]*>')

# Start of the "(@handle" text in a title; the display name is the title text
# that precedes it.
HANDLE_MARKER = '(@hasz'

# Position of the snapshot timestamp among the '/'-separated components of the
# file path that grep prefixes to every input line.
DATE_PATH_INDEX = 4

# Localised "<name> on Twitter" phrases that follow the display name in a
# title. They are tried in order and the first one found in a line wins.
# Matching is done on the raw, still HTML-escaped line, which is why one entry
# contains '&#39;'.
ON_TWITTER = [
    ' on Twitter:',
    ' auf Twitter',
    ' sa Twitter',
    ' sur Twitter ',
    ' op Twitter',
    ' Twitter ನಲ್ಲಿ',
    ' على تويتر',
    ' ٹوئٹر پر',
    ' på Twitter',
    ' Twitterissä',
    ' Twitterren',
    ' di Twitter',
    ' na Twitterze',
    ' na Twitteri',
    ' บนทวิตเตอร์',
    ' Twitter वर',
    ' a Twitteren',
    ' su Twitter',
    'さんのツイート',
    ' en Twitter',
    ' Twitter&#39;da',
    ' в Твиттере',
    ' 的 Twitter',
    ' בטוויטר',
    ' ar Twitter',
    ' pe Twitter',
    ' na Twitteru',
    ' на Твитеру',
    ' a Twitter',
]

# Display name -> earliest snapshot timestamp at which it was seen.
known_names = {}
# Files whose display name had already been recorded with an earlier (or
# equal) timestamp. Kept at module level so the optional clean-up step at the
# end of the script can use it.
files_to_delete = []


def parse_grep_line(raw):
    """Split one ``path:matched text`` line of grep output.

    Returns a ``(fname, date, text)`` tuple, where ``date`` is path component
    number ``DATE_PATH_INDEX`` of ``fname``. Returns ``None`` when the line has
    no ``:`` separator or the path has too few components, instead of raising.
    """
    fname, sep, text = raw.partition(':')
    if not sep:
        return None
    parts = fname.split('/')
    if len(parts) <= DATE_PATH_INDEX:
        return None
    return fname, parts[DATE_PATH_INDEX], text


def _text_before(line, marker):
    """Return the stripped title text that precedes *marker*, or ``None``.

    ``None`` means no ``<title>`` tag occurs before the marker, so there is
    nothing to extract (previously this case raised ``IndexError``).
    """
    pieces = RE_TITLE.split(line.split(marker, 1)[0])
    if len(pieces) < 2:
        return None
    return pieces[1].strip()


def extract_display_name(line):
    """Return the HTML-unescaped display name found in *line*, or ``None``.

    A line containing ``HANDLE_MARKER`` is cut at that marker; any other line
    is cut at the first ``ON_TWITTER`` phrase it contains. The name is
    unescaped here, before de-duplication, so that differently escaped
    spellings of the same name compare equal.
    """
    if HANDLE_MARKER in line:
        name = _text_before(line, HANDLE_MARKER)
    else:
        name = None
        for marker in ON_TWITTER:
            if marker in line:
                name = _text_before(line, marker)
                # Diagnostic kept from the original script: show names that
                # still contain the English phrase (presumably a sign that
                # the wrong marker matched -- review such entries by hand).
                if name and 'on Twitter:' in name:
                    print(repr(name[:100]))
                break
    return html.unescape(name) if name else None


def record_name(name, date, fname):
    """Remember the earliest *date* for *name*; otherwise mark *fname* as a duplicate."""
    known_date = known_names.get(name)
    if not known_date or date < known_date:
        known_names[name] = date
    else:
        files_to_delete.append(fname)


def format_entry(name, date):
    """Format one report row as ``YYYY-MM-DD<TAB>name`` from a timestamp string."""
    return f'{date[0:4]}-{date[4:6]}-{date[6:8]}\t{name}'


def main(lines=None):
    """Read grep output, collect display names and print them ordered by date.

    *lines* is any iterable of input lines; when omitted, input comes from
    ``fileinput.input()`` (the files named on the command line, or stdin).
    """
    if lines is None:
        lines = fileinput.input()

    for raw in lines:
        parsed = parse_grep_line(raw)
        if parsed is None:
            # Blank lines are ignored silently; anything else is reported.
            if raw.strip():
                print('Unparsable', raw.rstrip('\n'))
            continue

        fname, date, text = parsed
        name = extract_display_name(text)
        if name:
            record_name(name, date, fname)
        else:
            print('Unknown', date, text)

    for name, date in sorted(known_names.items(), key=lambda item: item[1]):
        print(format_entry(name, date))


if __name__ == '__main__':
    main()
# As files are downloaded, this deletes duplicate files
# for fname in files_to_delete:
# os.unlink(fname)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment