@TakesxiSximada · Created May 3, 2020
textextraction.py
#!/usr/bin/env python3
import pit

config = pit.Pit.get('bob.twitter', {
    'require': {
        'default_url': 'default url',
        'consumer_key': 'your token',
        'consumer_secret': 'your token',
        'access_token': 'your token',
        'access_token_secret': 'your token',
    }
})
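# pit.Pit.get() loads the named settings from a per-user config file
# (conventionally kept under ~/.pit), prompting for any missing 'require'
# keys, so the credentials never have to live in this source file.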
default_url = config['default_url']
from html.parser import HTMLParser


class MLStripper(HTMLParser):
    def __init__(self):
        # Initializing the base parser also resets its internal state
        super().__init__(convert_charrefs=True)
        # Buffer that accumulates the stripped text
        self.fed = []

    def handle_data(self, d):
        # Keep only the text content found inside tags
        self.fed.append(d)

    def get_data(self):
        # Join the buffered chunks and return the plain text
        return ''.join(self.fed)


def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()
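# Quick sanity check (hypothetical input):
#   strip_tags('<p>Hello <b>world</b></p>')  # -> 'Hello world'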
from readability.readability import Document
import requests

res = requests.get(default_url)
doc = Document(res.text)
text = strip_tags(doc.summary())
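# Document(res.text).summary() returns the cleaned-up HTML of the page's main
# article body, which strip_tags() then reduces to plain text.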
from janome.tokenizer import Tokenizer

tokenize = Tokenizer().tokenize

# Janome part-of-speech labels used below
MEISHI = '名詞'      # noun
IPPAN = '一般'       # general
JOSHI = '助詞'       # particle
JODOUSHI = '助動詞'  # auxiliary verb
DOUSHI = '動詞'      # verb
from zope.interface import (
    Interface,
    Attribute,
    implementer,
)


class IWord(Interface):
    surface = Attribute('Surface form')
    reading = Attribute('Reading')
    part = Attribute('Part of speech')
    part_sub = Attribute('Second-level part of speech')
    part_property = Attribute('Third-level part of speech')
    part_sub_property = Attribute('Fourth-level part of speech')
class Word:
    def __init__(self, token, part, part_sub=None, part_property=None,
                 part_sub_property=None):
        self.token = token
        self.part = part
        self.part_sub = part_sub
        self.part_property = part_property
        self.part_sub_property = part_sub_property


class WordFactory:
    default_word_class = Word

    def __init__(self, word_class=None):
        self.word_class = self.default_word_class if word_class is None else word_class

    def __call__(self, token):
        part_of_speech = token.part_of_speech.split(',')
        return self.word_class(token, *part_of_speech)


create_word = WordFactory()
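# Janome tokens carry their part-of-speech info as a comma-separated string,
# e.g. '名詞,一般,*,*' (noun, general); splitting it supplies Word with up to
# four classification levels.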
meishis = []    # nouns
joshis = []     # particles
jodoushis = []  # auxiliary verbs
doushis = []    # verbs

for token in tokenize(text):
    word = create_word(token)
    if word.part == MEISHI and word.part_sub == IPPAN:
        meishis.append(word.token.surface)
    if word.part == JOSHI:
        joshis.append(word.token.surface)
    if word.part == JODOUSHI:
        jodoushis.append(word.token.surface)
    if word.part == DOUSHI:
        doushis.append(word.token.surface)
import random

# Compose a random sentence: noun + particle + verb
sentence = random.choice(meishis) + random.choice(joshis) + random.choice(doushis)
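# Note: random.choice() raises IndexError on an empty sequence, so this line
# fails if the page yielded no word of a given class.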
import tweepy

consumer_key = config['consumer_key']
consumer_secret = config['consumer_secret']
access_token = config['access_token']
access_token_secret = config['access_token_secret']

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
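# tweepy's API.update_status() posts the tweet via Twitter's v1.1
# statuses/update endpoint, under the OAuth 1.0a user context set up above.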
print(sentence)
api.update_status(sentence)