Skip to content

Instantly share code, notes, and snippets.

@FooQoo
Last active December 7, 2018 01:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save FooQoo/c028e522d99b3209f58d035a89c802ee to your computer and use it in GitHub Desktop.
Save FooQoo/c028e522d99b3209f58d035a89c802ee to your computer and use it in GitHub Desktop.
ツイート中のストップワードを取り除き、単語の分かち書きにするPythonスクリプト
'''
requirement : janome, pandas
'''
import re
from janome.tokenizer import Tokenizer
import unicodedata
from html import unescape
import pandas as pd
class Pretweet(object):
    """Preprocess tweets: strip Twitter noise, tokenize with janome,
    and keep only content-bearing nouns as a space-separated string."""

    def __init__(self):
        # One shared tokenizer; janome's Tokenizer is expensive to construct.
        self.tokenizer = Tokenizer()

    def cleaning(self, text):
        """Remove retweet markers, @mentions, #hashtags and URLs from *text*.

        The text is padded with a space on each side so the
        whitespace-delimited patterns can match tokens at the very
        start/end of the tweet; the padding is stripped before returning.
        """
        text = ' ' + unescape(text) + ' '
        stopwords = []
        # A leading 'RT' marks a retweet. NOTE: as in the original, this
        # removes every literal 'RT' occurrence, not only the prefix.
        if text[1:3] == 'RT':
            stopwords.append('RT')
        # @mentions: Twitter handles are [A-Za-z0-9_]; the original class
        # [a-zA-Z1-9] missed '0' and '_'.
        stopwords += re.findall(r'\s@[A-Za-z0-9_]+\s', text)
        # #hashtags
        stopwords += re.findall(r'\s#\w+\s', text)
        # URLs (http/https/ftp)
        stopwords += [g[0] for g in re.findall(
            r'((https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+\$,%#]+))',
            text)]
        # re.escape is required: URLs contain regex metacharacters
        # (?, +, $, parentheses, ...) which previously produced a broken
        # or wrong removal pattern when joined raw.
        pattern = '|'.join(re.escape(word) for word in stopwords)
        cleaned = re.sub(pattern, '', text) if pattern else text
        # Drop the padding spaces added above.
        return cleaned[1:-1]

    def annotation(self, text):
        """Tokenize *text*; return (base_form, part-of-speech parts) pairs."""
        return [(token.base_form, token.part_of_speech.split(','))
                for token in self.tokenizer.tokenize(text)]

    def normalization(self, text):
        """NFKC-normalize *text* (e.g. fullwidth ASCII -> halfwidth)."""
        return unicodedata.normalize("NFKC", text)

    def is_not_my_stopword(self, word, attr):
        """Return True when *word* is a content word worth keeping.

        Keeps nouns tagged proper/general/verbal that are either two or
        more kanji/kana/alphanumeric characters, or a single kanji
        (which still carries meaning on its own).
        """
        if attr[0] == '名詞' and attr[1] in ['固有名詞', '一般', 'サ変接続']:
            # Original class had duplicated a-zA-Z / 1-9 runs and missed '0'.
            if re.match(r'[一-龥ぁ-んァ-ンa-zA-Z0-9]', word) and len(word) >= 2:
                return True
            if re.match(r'[一-龥]', word) and len(word) == 1:
                return True
        return False

    def segmentation(self, text, cleaning=True, cleaning_pos=True, normalization=True):
        """Return *text* as space-separated content words.

        cleaning      -- strip RT/@mention/#hashtag/URL noise first
        cleaning_pos  -- keep only words accepted by is_not_my_stopword
        normalization -- NFKC-normalize each word
        """
        if cleaning:
            text = self.cleaning(text)
        segment = []
        for word, attr in self.annotation(text):
            if normalization:
                word = self.normalization(word)
            # Fixes vs. the original: cleaning_pos=False now keeps every
            # word (it used to drop all of them), and the normalization
            # flag is respected (words were re-normalized unconditionally
            # at append time).
            if not cleaning_pos or self.is_not_my_stopword(word, attr):
                segment.append(word)
        return ' '.join(segment)
if __name__ == "__main__":
pretweet = Pretweet()
lines = pd.read_csv('tweets.csv', encoding = 'utf-8').text.tolist()
with open('processed.txt', 'w', encoding='utf-8') as f:
for line in lines:
f.write('{}\n'.format(pretweet.segmentation(line)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment