Skip to content

Instantly share code, notes, and snippets.

@palevell
Last active October 9, 2017 15:34
Show Gist options
  • Save palevell/9614f01f21bebefe5b1cfdb745c02105 to your computer and use it in GitHub Desktop.
Save palevell/9614f01f21bebefe5b1cfdb745c02105 to your computer and use it in GitHub Desktop.
Cleanup Twitter Followings with Ruby Gem 't' and Python's language detect module
#!/usr/bin/env python3
# t-clean.py v1.2.22 - Friday, September 15, 2017
# Cleanup Twitter Followings with Ruby Gem 't' and Python's language detect module
"""
Long Records (follow, unfollow, groupies, whoami, etc.)
0 ID
1 Since
2 Last tweeted at
3 Tweets
4 Favorites
5 Listed
6 Following
7 Followers
8 Screen name
9 Name
10 Verified
11 Protected
12 Bio
13 Status
14 Location
15 URL
"""
"""
Short Records (timeline)
0 ID
1 Posted at
2 Screen name
3 Text
"""
import csv, os, shutil, stat, sys, tempfile, time
from langdetect import detect, lang_detect_exception
from localnow import localnow
from random import randint
# Runtime switches.
# DEBUG: taccts() returns its two placeholder account names when this is set.
# DRYRUN: run_t() appends ' --dry-run' to follow/unfollow commands when set.
DEBUG = True
DRYRUN = True

# File the main program opens (append mode) to log every command issued.
logfile = 't-clean.log'

# One week (168 hours) expressed in seconds; the main program compares the
# age of the followings export against it.
week_secs = 7 * 24 * 60 * 60

# Language code -> whether accounts using that language are acceptable.
languages = dict(en=True, ar=False, fa=False, hi=False)

# Codes flagged as not acceptable above, in declaration order.
rejected = tuple(code for code, accepted in languages.items() if not accepted)

# Placeholder declarations kept for reference (not executed):
# languages = list()
# accept = list()
# reject = list()
# neutral = list()
def file_age_in_seconds(pathname):
    """Return how many seconds ago `pathname` was last modified.

    The current time is sampled first, then the modification time is read
    from the tuple-style stat result (index ``stat.ST_MTIME``).

    Raises:
        OSError: propagated from ``os.stat`` when the path cannot be stat'ed.
    """
    now = time.time()
    file_info = os.stat(pathname)
    modified_at = file_info[stat.ST_MTIME]
    return now - modified_at
# ToDo: move this to config.py
def taccts():
    """Return the sorted list of Twitter account names configured in `t`.

    Runs ``t accounts`` through the shell, drops every output line that
    starts with a space, and collects the remaining lines (presumably the
    account names; indented lines are assumed to be per-account details).

    When DEBUG is set the command is still executed, but the fixed
    placeholder names 'Twit1' and 'Twit2' are returned instead of its output.

    Returns:
        list[str]: account names in sorted order.

    On OSError the error text is printed and the process exits with the
    error's errno.
    """
    # mkstemp() hands back an *open* descriptor along with the path.  Only the
    # shell redirect below writes to the file, so close the descriptor right
    # away; otherwise every call leaks one.
    fd, tmpfile = tempfile.mkstemp('.txt', 't-accounts', '/tmp')
    os.close(fd)
    cmd = "t accounts |grep -v '^ ' >" + tmpfile
    accts = []
    try:
        try:
            os.system(cmd)
            with open(tmpfile, 'rt') as results:
                accts = [line.rstrip('\n') for line in results]
        finally:
            # Remove the scratch file even when reading it failed.
            os.remove(tmpfile)
    except OSError as e:
        print(" *** OSError", e.strerror)
        sys.exit(e.errno)
    if DEBUG:
        return sorted(['Twit1', 'Twit2'])
    return sorted(accts)
def run_t(acct, cmd, args=None, timestamp=False):
    """Run one `t` sub-command for `acct`, redirecting its output to a file.

    The command line first switches the active account (``t set active``)
    and then runs ``t <cmd>``; stdout of the latter is redirected to
    ``local/data/<acct>_<cmd>[_<timestamp>].csv``.  The command line is
    printed and appended to the module-level log handle ``fp`` before it is
    executed, and the function then sleeps 30-60 seconds to stay under
    Twitter's rate limit.

    Args:
        acct: account name; must be one of the names returned by taccts().
        cmd: one of 'followings', 'followers', 'whoami', 'follow',
            'unfollow', 'timeline'.
        args: extra command-line arguments as one whitespace-separated
            string (required for 'follow' and 'unfollow').
        timestamp: when true, a ``_YYYYmmdd_HHMMSS`` suffix from localnow()
            is added to the output file name.

    Returns:
        The output file path if it exists after the command ran, else None.

    Raises:
        ValueError: unknown account, unknown command, or missing `args`
            for a command that needs them.
        OSError: re-raised if logging or running the command fails.
    """
    import shlex  # local import: not part of the file's import block

    commands = ('followings', 'followers', 'whoami', 'follow', 'unfollow', 'timeline')
    needargs = ('follow', 'unfollow')
    usecsv = ('whoami', 'followings', 'followers', 'timeline')

    # Raise real exception objects: raising a plain string is itself a
    # TypeError and hides the intended message.
    if acct not in taccts():
        raise ValueError("Unrecognized Twitter account: " + acct)
    if cmd not in commands:
        raise ValueError("Unrecognized command: " + cmd)
    if cmd in needargs and args is None:
        raise ValueError(cmd + " requires argument(s)")

    # Arguments may originate from downloaded CSV data, so quote each token
    # before splicing it into the shell command line.
    safe_args = ' '.join(shlex.quote(token) for token in args.split()) if args else ''

    csvopt = '--csv' if cmd in usecsv else ''
    ts = '_' + localnow().strftime("%Y%m%d_%H%M%S") if timestamp else ''
    # The file name is built from the bare command name, before any
    # '--dry-run' suffix is appended.
    csvfile = os.path.join('local', 'data', acct + '_' + cmd + ts + '.csv')
    if cmd in needargs and DRYRUN:
        cmd += ' --dry-run'

    # We can replace 't' with the full path, stored in t_cmd
    oscmd = ('t set active ' + shlex.quote(acct) + '>/dev/null && t ' + cmd
             + ' ' + csvopt + ' ' + safe_args + ' >' + shlex.quote(csvfile))
    stamp = str(localnow().replace(microsecond=0))
    try:
        print(stamp + ' ' + oscmd)
        fp.write(stamp + ' ' + oscmd + '\n')
        os.system(oscmd)
        # Try to abide by Twitter's rate limit . . .
        time.sleep(randint(30, 60))
    except OSError as e:
        print(" *** OSError:", e.strerror)
        # Bare raise keeps the original errno/message for the caller.
        raise
    return csvfile if os.path.exists(csvfile) else None
# Prerequisites
# Check for Ruby gem 't': look up an executable named 't' on the default
# search path; t_cmd holds its full path, or None when it is not found.
t_cmd = shutil.which('t', mode=os.F_OK | os.X_OK, path=None)
if t_cmd is None:
    print(" Unmet dependency: Ruby Gem 't'. Aborting.")
    # Exit with a non-zero status so callers can tell the run was aborted.
    sys.exit(1)
# Main Program
#
# For every account returned by taccts():
#   1. refresh the followings export when it is missing or a week old,
#   2. score each following by the detected language of its name, bio and
#      location, and unfollow those with at least two rejected languages,
#   3. fetch the last 100 timeline tweets and unfollow the authors of tweets
#      written in a rejected language,
#   4. rotate the followings export if anything was unfollowed.
import traceback  # used only by the TypeError handler below

# Number of screen names handed to a single `t unfollow` invocation.
_BATCH_SIZE = 20


def _unfollow_batch(acct, handles):
    """Record a `whoami` snapshot for `acct`, then unfollow `handles`."""
    run_t(acct, 'whoami', timestamp=True)
    run_t(acct, 'unfollow', ' '.join(handles), timestamp=True)


grand_total = 0
# run_t() logs every command it issues through this module-level handle.
fp = open(logfile, 'at', encoding='utf-8')
try:
    for acct in taccts():
        print(acct, ". . . ", end='')
        # Path of the *followings* export written by run_t(acct, 'followings').
        followersCSV = os.path.join('local', 'data', acct + '_followings.csv')
        total = 0
        try:
            # A missing/unreadable export is treated as being a week old so
            # that it gets (re)generated.
            try:
                file_age = file_age_in_seconds(followersCSV)
            except OSError:
                file_age = week_secs
            if file_age >= week_secs:
                run_t(acct, 'followings')

            # ---- Pass 1: long records from the followings export ----
            pending = []
            with open(followersCSV, 'rt', newline='') as csvfile:
                reader = csv.reader(csvfile)
                next(reader, None)  # Skip header row (tolerates an empty file)
                for row in reader:
                    if len(row) < 15:
                        continue  # not a complete long record
                    twit = row[8]     # Screen Name (Twitter handle)
                    name = row[9]     # User (Display Name)
                    bio = row[12]     # bio
                    status = row[13]  # Status (Last Tweet)
                    loc = row[14]     # location
                    # Only linguistic fields are passed to detect(); a row is
                    # skipped when any of them cannot be classified.
                    try:
                        twit_lang = detect(twit)
                        name_lang = detect(name)
                        bio_lang = detect(bio)
                        loc_lang = detect(loc)
                        status_lang = detect(status)
                    except lang_detect_exception.LangDetectException:
                        continue
                    print("Twit/Name/Location (%2s / %2s / %2s): %16s / %25s / %25s\n\tBio (%2s): %160s\n\tStatus (%2s): %140s\n\n" % (twit_lang, name_lang, loc_lang, twit, name, loc, bio_lang, bio, status_lang, status))
                    # One point for each of name / bio / location written in
                    # a rejected language; two or more marks the account.
                    score = sum(lang in rejected for lang in (name_lang, bio_lang, loc_lang))
                    if score < 2:
                        continue
                    pending.append(twit)
                    # Five handles per printed line.
                    print(twit, end='\n' if len(pending) % 5 == 0 else ' ')
                    if len(pending) >= _BATCH_SIZE:
                        _unfollow_batch(acct, pending)
                        total += len(pending)
                        pending = []
            if pending:
                # unfollow remaining twits
                _unfollow_batch(acct, pending)
                total += len(pending)

            # ---- Pass 2: short records from the recent timeline ----
            timelineCSV = run_t(acct, 'timeline', '-n100', timestamp=True)
            print(timelineCSV)
            if timelineCSV is None:
                # run_t() returns None when no output file was produced.
                print(" *** No timeline data for", acct)
            else:
                # Starts empty, so nothing carries over from pass 1.
                flagged = set()
                with open(timelineCSV, 'rt', newline='') as csvfile:
                    reader = csv.reader(csvfile)
                    next(reader, None)  # Skip header row
                    for row in reader:
                        if len(row) < 4:
                            continue  # not a complete short record
                        twit = row[2]   # Screen Name (Twitter handle)
                        tweet = row[3]  # Tweet
                        try:
                            tweet_lang = detect(tweet)
                        except lang_detect_exception.LangDetectException:
                            continue
                        if tweet_lang == 'en':
                            continue
                        if tweet_lang in rejected:
                            if twit not in flagged:
                                flagged.add(twit)
                                print(twit, end='\n' if len(flagged) % 5 == 0 else ' ')
                        else:
                            # Neither English nor rejected: just report it.
                            print("%15s: (%2s) %s" % (twit, tweet_lang, tweet))
                        if len(flagged) >= _BATCH_SIZE:
                            _unfollow_batch(acct, sorted(flagged))
                            total += len(flagged)
                            flagged = set()
                if flagged:
                    # unfollow remaining twits found in timeline
                    _unfollow_batch(acct, sorted(flagged))
                    total += len(flagged)

            print("\tTotal twits:", str(total))
            grand_total += total
            if total > 0:
                run_t(acct, 'whoami', timestamp=True)
                # Move the now-stale export aside; a later run finds it
                # missing and downloads a fresh one.
                os.rename(followersCSV, followersCSV + '~.1')
        except OSError as oe:
            print(" *** OSError", oe.strerror)
            sys.exit(oe.errno or 1)
        except TypeError as te:
            print(" *** TypeError:", te)
            traceback.print_exc()
            sys.exit(1)
        except Exception as e:
            print(" *** Exception:", e)
            sys.exit(1)
finally:
    # Runs on normal completion and on the sys.exit() paths above.
    fp.close()
print("\nGrand total twits: ", str(grand_total))
sys.exit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment