Skip to content

Instantly share code, notes, and snippets.

@kitsuyui
Last active January 3, 2017 10:59
Show Gist options
  • Save kitsuyui/74574813abc4ee050f5e0a78f04a3393 to your computer and use it in GitHub Desktop.
Save kitsuyui/74574813abc4ee050f5e0a78f04a3393 to your computer and use it in GitHub Desktop.
Python で画像や URL やリプライを含まない日本語のツイートを集める ref: http://qiita.com/kitsuyui/items/b30296f13f71b287f14f
#!/bin/sh
# Twitter API credentials, exported as environment variables.
# tweetcorpus.py reads these four names from os.environ, so replace the
# placeholder values with your own and load this file with `source ./.env`
# before running the script.
export APP_KEY='XXXXXXXXXXXXX'
export APP_SECRET='XXXXXXXXXXXXXXXXXXXX'
export OAUTH_TOKEN='XXXXX-XXXXXXXXXX'
export OAUTH_TOKEN_SECRET='XXXXXXXXXX'
$ python tweetcorpus.py -n 10
$ while true; do python -u tweetcorpus.py -n 500 | tee /dev/tty | gzip -cn >> tweet.gz ; sleep 1 ; done
source ./.env
$ pip3 install twython==3.4.0
import argparse
import html
import os
import sys
from twython import TwythonStreamer
class CorpusStreamer(TwythonStreamer):
    """Streaming client that collects plain-text tweets, one per output line.

    Every tweet that passes the filters in ``on_success`` is written to
    ``write_file``. Once ``max_corpus_tweets`` tweets have been written the
    stream is disconnected.
    """

    def __init__(self, *args,
                 max_corpus_tweets=100,
                 write_file=sys.stdout,
                 **kwargs):
        """Initialise the streamer.

        Args:
            *args: Positional arguments passed through to
                ``TwythonStreamer.__init__``.
            max_corpus_tweets: Number of written tweets after which the
                stream is disconnected.
            write_file: Text file object that receives the tweets.
            **kwargs: Additional keyword arguments forwarded unchanged to
                ``TwythonStreamer.__init__``.
        """
        super().__init__(*args, **kwargs)
        self.corpus_tweets = 0
        self.max_corpus_tweets = max_corpus_tweets
        self.write_file = write_file

    def exit_when_corpus_tweets_exceeded(self):
        """Disconnect the stream once enough tweets have been written."""
        if self.corpus_tweets >= self.max_corpus_tweets:
            self.disconnect()

    def write(self, text):
        """Write one tweet as a single line and count it.

        Newlines inside the tweet are replaced with carriage returns so that
        each tweet occupies exactly one line of the output.
        """
        corpus_text = text.replace('\n', '\r')
        self.write_file.write(corpus_text + '\n')
        self.corpus_tweets += 1

    def on_success(self, tweet):
        """Handle one message from the stream, keeping only plain tweets.

        Args:
            tweet: Decoded message (a dict) delivered by the stream.
        """
        if 'text' not in tweet:
            # Skip anything that is not a tweet (notifications and the like).
            return
        if 'retweeted_status' in tweet:
            # Skip retweets.
            return
        # Skip tweets carrying any entity (tweet.entities.url,
        # tweet.entities.media, tweet.entities.symbol, ...): they contain
        # information that natural language processing alone cannot handle.
        # A payload without an 'entities' key is treated as having none
        # instead of raising KeyError.
        entities = tweet.get('entities') or {}
        if any(entities.values()):
            return
        text = html.unescape(tweet['text'])
        self.write(text)
        self.exit_when_corpus_tweets_exceeded()
def main():
    """Parse the command line and stream sample tweets into the corpus file.

    Options:
        -n / --number-of-corpus-tweets: tweets to collect (default 100).
        -o / --outfile: UTF-8 output file (default standard output).
        -l / --language: language filter passed to the sample stream
            (default 'ja').

    The credentials are taken from the environment variables APP_KEY,
    APP_SECRET, OAUTH_TOKEN and OAUTH_TOKEN_SECRET. If any of them is
    missing, the program exits through ``parser.error`` (exit status 2)
    naming the missing variables.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--number-of-corpus-tweets',
                        type=int, default=100)
    parser.add_argument('-o', '--outfile',
                        type=argparse.FileType('w', encoding='UTF-8'),
                        default=sys.stdout)
    parser.add_argument('-l', '--language', type=str, default='ja')
    # Parse before touching the environment so that -h/--help and argument
    # errors work even when no credentials are configured.
    args = parser.parse_args()

    credential_names = ('APP_KEY', 'APP_SECRET',
                        'OAUTH_TOKEN', 'OAUTH_TOKEN_SECRET')
    missing = [name for name in credential_names if name not in os.environ]
    if missing:
        parser.error('missing environment variable(s): ' + ', '.join(missing))
    app_key, app_secret, oauth_token, oauth_token_secret = (
        os.environ[name] for name in credential_names)

    stream = CorpusStreamer(app_key, app_secret,
                            oauth_token, oauth_token_secret,
                            max_corpus_tweets=args.number_of_corpus_tweets,
                            write_file=args.outfile)
    stream.statuses.sample(language=args.language)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment