Created May 3, 2020 at 03:42.
Save TakesxiSximada/30835e410d3d2705ffabd6fdaadee8f0 to your computer and use it in GitHub Desktop.
textextraction.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
import pit

# Load the script's settings from pit's account store under 'bob.twitter'.
# The 'require' mapping lists the keys this script needs; the values are
# placeholder prompts shown when the configuration has not been filled in yet.
# NOTE(review): pit may open an editor interactively on first run — confirm.
config = pit.Pit.get('bob.twitter', {
    'require': {
        'default_url': 'default url',
        'consumer_key': 'your token',
        'consumer_secret': 'your token',
        'access_token': 'your token',
        'access_token_secret': 'your token',
    }
})
# URL of the page whose text will be extracted and tweeted about.
default_url = config['default_url']
from html.parser import HTMLParser | |
class MLStripper(HTMLParser):
    """HTML parser that collects only text content, discarding all markup.

    Feed it HTML with ``feed()`` and retrieve the concatenated text with
    ``get_data()``.
    """

    def __init__(self):
        # Original code skipped super().__init__() and shadowed
        # convert_charrefs with a no-op method; its truthiness accidentally
        # enabled charref conversion. Initialize the base parser properly:
        # convert_charrefs=True decodes references (&amp; etc.) before they
        # reach handle_data, matching the original's effective behavior.
        super().__init__(convert_charrefs=True)
        # Buffer of text fragments collected from between tags.
        self.fed = []

    def handle_data(self, d):
        # Keep only the text found inside any tag.
        self.fed.append(d)

    def get_data(self):
        # Join the buffered fragments into a single string.
        return ''.join(self.fed)
def strip_tags(html):
    """Return the text content of *html* with all markup removed."""
    stripper = MLStripper()
    stripper.feed(html)
    return stripper.get_data()
from readability.readability import Document
import requests

# Download the configured page and pull out its main article body,
# then strip the remaining HTML down to plain text.
response = requests.get(default_url)
document = Document(response.text)
text = strip_tags(document.summary())
from janome.tokenizer import Tokenizer

# Bind the tokenizer's method once; it is applied to the extracted text below.
tokenize = Tokenizer().tokenize

# Part-of-speech labels emitted by janome (Japanese morphological analyzer).
# The unused `meisi`/`josi` sets from the original were dead code (the
# `meishis`/`joshis` lists below are what actually accumulate words) and
# have been removed.
MEISHI = '名詞'      # noun
IPPAN = '一般'       # general (noun sub-category)
JOSHI = '助詞'       # particle
JODOUSHI = '助動詞'  # auxiliary verb
DOUSHI = '動詞'      # verb
from zope.interface import (
    Interface,
    Attribute,
    implementer,
)


class IWord(Interface):
    """Interface describing one morphologically analyzed word."""
    surface = Attribute('表記')             # written form
    reading = Attribute('読み')             # reading (pronunciation)
    part = Attribute('品詞')                # part of speech
    part_sub = Attribute('第二品詞')        # second-level POS category
    part_property = Attribute('第三品詞')   # third-level POS category
    part_sub_property = Attribute('第四品詞')  # fourth-level POS category
class Word:
    """Wraps a janome token together with its parsed part-of-speech fields.

    ``part`` is the top-level part of speech; the three optional fields are
    progressively finer POS sub-categories (``None`` when absent).
    """

    def __init__(self, token, part, part_sub=None, part_property=None, part_sub_property=None):
        # Keep the raw token around so callers can still reach .surface etc.
        self.token = token
        self.part = part
        self.part_sub = part_sub
        self.part_property = part_property
        self.part_sub_property = part_sub_property
class WordFactory:
    """Builds Word objects from janome tokens.

    A different word class may be injected via the constructor; otherwise
    ``default_word_class`` is used.
    """

    default_word_class = Word

    def __init__(self, word_class=None):
        if word_class is None:
            word_class = self.default_word_class
        self.word_class = word_class

    def __call__(self, token):
        # janome encodes the POS hierarchy as a comma-separated string;
        # split it and spread the fields into the word constructor.
        pos_fields = token.part_of_speech.split(',')
        return self.word_class(token, *pos_fields)
create_word = WordFactory()

# Bucket the surface forms of the text's words by part of speech.
meishis = []    # general nouns
joshis = []     # particles
jodoushis = []  # auxiliary verbs
doushis = []    # verbs

for token in tokenize(text):
    word = create_word(token)
    surface = word.token.surface
    # The POS categories are mutually exclusive, so an elif chain suffices.
    if word.part == MEISHI and word.part_sub == IPPAN:
        meishis.append(surface)
    elif word.part == JOSHI:
        joshis.append(surface)
    elif word.part == JODOUSHI:
        jodoushis.append(surface)
    elif word.part == DOUSHI:
        doushis.append(surface)

import random

# Compose a random noun + particle + verb pseudo-sentence.
sentence = random.choice(meishis) + random.choice(joshis) + random.choice(doushis)
import tweepy

# Authenticate against the Twitter API with the stored credentials
# and post the generated sentence.
auth = tweepy.OAuthHandler(config['consumer_key'], config['consumer_secret'])
auth.set_access_token(config['access_token'], config['access_token_secret'])
api = tweepy.API(auth)

print(sentence)
api.update_status(sentence)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.