Last active
October 9, 2017 15:34
-
-
Save palevell/9614f01f21bebefe5b1cfdb745c02105 to your computer and use it in GitHub Desktop.
Cleanup Twitter Followings with Ruby Gem 't' and Python's language detect module
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# t-clean.py v1.2.22 - Friday, September 15, 2017 | |
# Clean up Twitter followings with the Ruby gem 't' and Python's langdetect module
"""
Long Records (follow, unfollow, groupies, whoami, etc.) | |
0 ID | |
1 Since | |
2 Last tweeted at | |
3 Tweets | |
4 Favorites | |
5 Listed | |
6 Following | |
7 Followers | |
8 Screen name | |
9 Name | |
10 Verified | |
11 Protected | |
12 Bio | |
13 Status | |
14 Location | |
15 URL | |
""" | |
""" | |
Short Records (timeline) | |
0 ID | |
1 Posted at | |
2 Screen name | |
3 Text | |
""" | |
import csv, os, shutil, stat, sys, tempfile, time | |
from langdetect import detect, lang_detect_exception | |
from localnow import localnow | |
from random import randint | |
# Runtime switches.
# DEBUG: taccts() returns the placeholder accounts instead of the real ones.
# DRYRUN: run_t() appends ' --dry-run' to follow/unfollow commands.
DEBUG, DRYRUN = True, True

# Log file that the main program opens in append mode.
logfile = 't-clean.log'

# One week (7 days x 24 hours) expressed in seconds.
week_secs = 7 * 24 * 60 * 60

# Language code -> accepted?
languages = dict(en=True, ar=False, fa=False, hi=False)

# Language codes that count against an account when scoring.
rejected = 'ar', 'fa', 'hi'

# Earlier list-based layout, kept for reference only:
# languages = list()
# accept = list()
# reject = list()
# neutral = list()
def file_age_in_seconds(pathname):
    """Return how many seconds have passed since *pathname* was last modified.

    The modification time is read through the tuple interface of
    ``os.stat`` (index ``stat.ST_MTIME``) and subtracted from the current
    wall-clock time.  Any ``OSError`` raised by ``os.stat`` (for example
    when the file does not exist) propagates to the caller.
    """
    now = time.time()
    status = os.stat(pathname)
    modified_at = status[stat.ST_MTIME]
    return now - modified_at
# ToDo: move this to config.py | |
def taccts():
    """Return the sorted list of Twitter account names configured in ``t``.

    Runs ``t accounts`` through the shell, drops every output line that
    starts with a space (``grep -v '^ '``; presumably the indented
    per-account detail lines -- confirm against ``t accounts`` output) and
    collects the remaining lines via a temporary file.

    When ``DEBUG`` is true the command is still executed, but the
    placeholder accounts ``Twit1`` and ``Twit2`` are returned instead of
    the real result.

    Returns:
        list: account names in sorted order.

    Exits the process (``sys.exit`` with the error's errno) on ``OSError``.
    """
    accts = []
    try:
        # mkstemp() hands back an *open* descriptor together with the path.
        # Only the path is needed (the shell writes the file), so close the
        # descriptor immediately instead of leaking it on every call.
        fd, tmpfile = tempfile.mkstemp('.txt', 't-accounts', '/tmp')
        os.close(fd)
        cmd = "t accounts |grep -v '^ ' >" + tmpfile
        try:
            os.system(cmd)
            with open(tmpfile, 'rt') as results:
                accts = [result.rstrip('\n') for result in results]
        finally:
            # Remove the temp file even when reading it failed.
            os.remove(tmpfile)
    except OSError as e:
        print(" *** OSError", e.strerror)
        sys.exit(e.errno)
    if DEBUG:
        return sorted(['Twit1', 'Twit2'])
    return sorted(accts)
def run_t(acct, cmd, args=None, timestamp=False):
    """Run one ``t`` sub-command for *acct* and capture its output in a file.

    The account is first made active with ``t set active``; the command's
    standard output is redirected to
    ``local/data/<acct>_<cmd>[_<YYYYmmdd_HHMMSS>].csv``.  The full shell
    command line is echoed to stdout and appended to the module-level log
    file object ``fp``, which must already be open.  After the command
    returns, the function sleeps 30-60 seconds (random) to stay under
    Twitter's rate limit.

    Args:
        acct: account name; must be one of the names returned by taccts().
        cmd: one of 'followings', 'followers', 'whoami', 'follow',
            'unfollow', 'timeline'.
        args: extra command-line arguments as one whitespace-separated
            string.  Required for 'follow' and 'unfollow'.
        timestamp: when true, a local-time stamp is added to the file name.

    Returns:
        The path of the output file if it exists afterwards, else None.

    Raises:
        ValueError: *acct* or *cmd* is not recognized.
        TypeError: *cmd* needs arguments but *args* is None.
        OSError: re-raised unchanged if logging or running the command
            fails at the OS level.
    """
    import shlex  # local import: only this function needs shell quoting

    commands = ('followings', 'followers', 'whoami', 'follow', 'unfollow', 'timeline')
    needargs = ('follow', 'unfollow')
    usecsv = ('whoami', 'followings', 'followers', 'timeline')

    # Raise real exception objects: raising a bare string is itself a
    # TypeError in Python 3 and hides the intended message.
    if acct not in taccts():
        raise ValueError("Unrecognized Twitter account: " + acct)
    if cmd not in commands:
        raise ValueError("Unrecognized command: " + cmd)
    if args is None:
        if cmd in needargs:
            raise TypeError(cmd + " requires argument(s)")
        args = ''

    csvopt = '--csv' if cmd in usecsv else ''
    ts = '_' + localnow().strftime("%Y%m%d_%H%M%S") if timestamp else ''
    # The file name is built from the bare command, before any option is added.
    csvfile = os.path.join('local', 'data', acct + '_' + cmd + ts + '.csv')
    if cmd in needargs and DRYRUN:
        cmd += ' --dry-run'

    # Quote every value that is not a fixed literal.  The handles in *args*
    # originate from downloaded CSV data and must not be able to inject
    # shell syntax.  Plain handles and options pass through unchanged.
    safe_args = ' '.join(shlex.quote(arg) for arg in args.split())
    # Keep a space before '>/dev/null': without it an all-digit account name
    # would be parsed by the shell as a file-descriptor number.
    # We can replace 't' with the full path, stored in t_cmd
    oscmd = ('t set active ' + shlex.quote(acct) + ' >/dev/null && t ' + cmd
             + ' ' + csvopt + ' ' + safe_args + ' >' + shlex.quote(csvfile))
    try:
        stamp = str(localnow().replace(microsecond=0))
        print(stamp + ' ' + oscmd)
        fp.write(stamp + ' ' + oscmd + '\n')
        os.system(oscmd)
        # Try to abide by Twitter's rate limit . . .
        time.sleep(randint(30, 60))
    except OSError as e:
        print(" *** OSError:", e.strerror)
        # Bare raise keeps errno/strerror for the caller's handler.
        raise
    if os.path.exists(csvfile):
        return csvfile
    return None
# Prerequisites
# Check for Ruby gem 't': look up an executable named 't' on the default PATH.
t_cmd = shutil.which('t', mode=os.F_OK | os.X_OK, path=None)
if t_cmd is None:
    print(" Unmet dependency: Ruby Gem 't'. Aborting.")
    # Exit with a non-zero status so a calling shell or scheduler can tell
    # that the run was aborted rather than completed.
    sys.exit(1)
# Main Program
import traceback  # used only for error reporting in the account loop below

_BATCH_SIZE = 20  # number of twits handed to a single `t unfollow` call


def _safe_detect(text):
    """Return the langdetect language code for *text*, or None if undetectable.

    langdetect raises LangDetectException for text it cannot classify (for
    example an empty field).  Mapping that to None lets one empty profile
    field stay neutral instead of discarding the whole record.
    """
    try:
        return detect(text)
    except lang_detect_exception.LangDetectException:
        return None


def _print_progress(twit, count):
    """Echo a flagged handle, starting a new output line after every fifth."""
    print(twit, end='\n' if count % 5 == 0 else ' ')


def _flagged_followings(csv_path):
    """Return screen names from a followings CSV ("long" records) to unfollow.

    A record is flagged when at least two of its name, bio and location
    fields are detected as one of the ``rejected`` languages.  Fields whose
    language cannot be detected do not count against the account.
    """
    flagged = []
    with open(csv_path, 'rt', newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header row (tolerates an empty file)
        for row in reader:
            if len(row) < 15:
                continue  # not a complete long record
            twit = row[8]     # Screen Name (Twitter handle)
            name = row[9]     # User (Display Name)
            bio = row[12]     # Bio
            status = row[13]  # Status (Last Tweet)
            loc = row[14]     # Location
            twit_lang = _safe_detect(twit)
            name_lang = _safe_detect(name)
            bio_lang = _safe_detect(bio)
            loc_lang = _safe_detect(loc)
            status_lang = _safe_detect(status)
            print("Twit/Name/Location (%2s / %2s / %2s): %16s / %25s / %25s\n"
                  "\tBio (%2s): %160s\n\tStatus (%2s): %140s\n\n"
                  % (twit_lang, name_lang, loc_lang, twit, name, loc,
                     bio_lang, bio, status_lang, status))
            # One point per profile field written in a rejected language.
            score = sum(lang in rejected for lang in (name_lang, bio_lang, loc_lang))
            if score >= 2:
                flagged.append(twit)
                _print_progress(twit, len(flagged))
    return flagged


def _flagged_tweeters(csv_path, skip=()):
    """Return screen names from a timeline CSV ("short" records) to unfollow.

    A handle is flagged once when one of its tweets is detected as a
    ``rejected`` language.  Handles listed in *skip* are never returned.
    Tweets in any other non-English language are only printed.
    """
    seen = set(skip)
    flagged = []
    with open(csv_path, 'rt', newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header row (tolerates an empty file)
        for row in reader:
            if len(row) < 4:
                continue  # not a complete short record
            twit = row[2]   # Screen Name (Twitter handle)
            tweet = row[3]  # Tweet
            tweet_lang = _safe_detect(tweet)
            if tweet_lang is None or tweet_lang == 'en':
                continue
            if tweet_lang in rejected:
                if twit not in seen:
                    seen.add(twit)
                    flagged.append(twit)
                    _print_progress(twit, len(flagged))
            else:
                print("%15s: (%2s) %s" % (twit, tweet_lang, tweet))
    return flagged


def _unfollow_in_batches(acct, twits):
    """Unfollow *twits* for *acct*, _BATCH_SIZE at a time; return how many.

    Each batch is preceded by a timestamped ``whoami`` snapshot.
    """
    for start in range(0, len(twits), _BATCH_SIZE):
        batch = twits[start:start + _BATCH_SIZE]
        run_t(acct, 'whoami', timestamp=True)
        run_t(acct, 'unfollow', ' '.join(batch), timestamp=True)
    return len(twits)


def _clean_account(acct):
    """Run both clean-up passes for *acct* and return the number of twits handled."""
    followings_csv = os.path.join('local', 'data', acct + '_followings.csv')

    # Refresh the cached followings list when it is missing or a week old.
    try:
        file_age = file_age_in_seconds(followings_csv)
    except OSError:
        file_age = week_secs
    if file_age >= week_secs:
        run_t(acct, 'followings')

    # Pass 1: score the profile of every account being followed.
    from_profiles = _flagged_followings(followings_csv)
    total = _unfollow_in_batches(acct, from_profiles)

    # Pass 2: look at the languages used in the latest 100 timeline tweets.
    timeline_csv = run_t(acct, 'timeline', '-n100', timestamp=True)
    print(timeline_csv)
    if timeline_csv is not None:
        from_timeline = _flagged_tweeters(timeline_csv, skip=from_profiles)
        total += _unfollow_in_batches(acct, from_timeline)

    print("\tTotal twits:", str(total))
    if total > 0:
        run_t(acct, 'whoami', timestamp=True)
        # Set the cached list aside so that it no longer exists under its
        # original name and the next run downloads a fresh one.
        os.rename(followings_csv, followings_csv + '~.1')
    return total


grand_total = 0
# run_t() writes every executed command line to this module-level log file.
fp = open(logfile, 'at', encoding='utf-8')
try:
    for acct in taccts():
        print(acct, ". . . ", end='')
        try:
            grand_total += _clean_account(acct)
        except OSError as oe:
            print(" *** OSError", oe.strerror)
            sys.exit(oe.errno or 1)
        except TypeError as te:
            print(" *** TypeError:", te)
            traceback.print_exc()
            sys.exit(1)
        except Exception as e:
            print(" *** Exception:", e)
            sys.exit(1)
finally:
    # Also runs when sys.exit() is called above, so the log is always closed.
    fp.close()
print("\nGrand total twits: ", str(grand_total))
sys.exit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment