Skip to content

Instantly share code, notes, and snippets.

@s-fujimoto
Created December 3, 2017 07:11
Show Gist options
  • Save s-fujimoto/20d4e24af400de29effefbaa93d292f4 to your computer and use it in GitHub Desktop.
Save s-fujimoto/20d4e24af400de29effefbaa93d292f4 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import twitter
import boto3
import os
# Search term handed to the Twitter search call in get_tweet_texts().
keyword = 'dev.classmethod.jp'
# AWS region used when creating the Comprehend client in detect_language().
region = 'us-east-1'
# Target number of tweets to collect: 10 search requests of 100 tweets each.
size = 10 * 100
def get_tweet_texts():
    """Collect recent tweets matching the module-level ``keyword``.

    Pages backwards through the search results, 100 tweets per request, for
    at most ``size / 100`` requests. After each page the next request is
    capped at one below the smallest tweet id seen, so pages do not overlap.
    Paging stops early once a request returns no tweets.

    Credentials are read from the ``consumer_key``, ``consumer_secret``,
    ``access_token_key`` and ``access_token_secret`` environment variables.

    Returns:
        list: The status objects returned by ``api.GetSearch`` (the whole
        objects, not only their text).

    Raises:
        KeyError: If any of the four credential environment variables is
            not set.
    """
    api = twitter.Api(consumer_key=os.environ['consumer_key'],
                      consumer_secret=os.environ['consumer_secret'],
                      access_token_key=os.environ['access_token_key'],
                      access_token_secret=os.environ['access_token_secret'],
                      sleep_on_rate_limit=True)
    page_size = 100
    maxid = None
    corpus = []
    for _ in range(int(size / page_size)):
        results = api.GetSearch(term=keyword, result_type='recent',
                                count=page_size, max_id=maxid)
        if not results:
            # Search exhausted: min() over an empty page would raise
            # ValueError, and further requests would return nothing new.
            break
        corpus.extend(results)
        # Ask only for tweets strictly older than the oldest one seen.
        maxid = min(result.id for result in results) - 1
    return corpus
def detect_language(corpus):
    """Accumulate dominant-language scores per language code.

    Sends the ``text`` attribute of every item in ``corpus`` to Amazon
    Comprehend's ``batch_detect_dominant_language`` in chunks of 25 and sums
    the ``Score`` of every reported language under its ``LanguageCode``.

    Args:
        corpus: Sequence of objects exposing a ``text`` attribute.

    Returns:
        dict: Language code mapped to the summed score across all items.
        Empty when ``corpus`` is empty.
    """
    client = boto3.client('comprehend', region_name=region)
    # NOTE(review): 25 is presumably the service's per-request document
    # cap - confirm against the Comprehend API reference.
    chunk_len = 25
    totals = {}
    for start in range(0, len(corpus), chunk_len):
        texts = [tweet.text for tweet in corpus[start:start + chunk_len]]
        response = client.batch_detect_dominant_language(TextList=texts)
        for entry in response['ResultList']:
            for candidate in entry['Languages']:
                code = candidate['LanguageCode']
                score = candidate['Score']
                if totals.get(code):
                    totals[code] = totals[code] + score
                else:
                    # First sighting (or a running total of zero).
                    totals[code] = score
    return totals
def stdout(result):
    """Print each language's share of the total score as a percentage.

    Writes one ``<code> : <pct>%`` line per entry, in the mapping's
    iteration order, with the percentage shown to one decimal place.
    Nothing is printed for an empty mapping.

    Args:
        result: Mapping of language code to accumulated score.
    """
    sum_score = sum(result.values())
    for code, score in result.items():
        # A zero total would otherwise divide by zero; report 0.0% instead.
        share = score / sum_score * 100 if sum_score else 0.0
        print('{} : {:.1f}%'.format(code, round(share, 1)))
def main():
    """Fetch tweets, score their languages, and print the breakdown."""
    stdout(detect_language(get_tweet_texts()))


# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment