Created
April 23, 2017 17:16
-
-
Save yoneda/3b9aa1b0f9220af8728ec5cab9a7ae3c to your computer and use it in GitHub Desktop.
Preprocessing for Japanese-language text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding:utf-8 | |
import re | |
import jaconv | |
# Remove an @mention
def removeMention(text):
    """Remove the first @mention found in *text*.

    The mention runs from the first "@" up to (but not including) the next
    half-width space, or to the end of the string when no space follows.
    Every occurrence of that exact mention string is removed from *text*.

    Args:
        text: The string to process.

    Returns:
        *text* with the mention removed, or *text* unchanged when it
        contains no "@".
    """
    start = text.find("@")
    if start == -1:
        # No mention present: nothing to cut.
        return text

    end = text.find(" ", start)
    if end == -1:
        # The mention is the last token, so it extends to the end of the text.
        end = len(text)

    mention = text[start:end]
    return text.replace(mention, "")
# Cleaning step
def clean_text(text):
    """Strip @mentions, hashtags, URLs and half-width spaces from *text*.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text.
    """
    cleaned = re.sub(r"[@]\w+", "", text)  # remove @mentions
    cleaned = re.sub(r"[#]\w+", "", cleaned)  # remove hashtags
    # Remove URLs. A URL ends at the first CR, LF or space (that terminator is
    # consumed as well) or at the end of the text, so a trailing URL with
    # nothing after it is also removed.
    cleaned = re.sub(r"https?://[^\r\n ]*(?:[\r\n ]|\Z)", "", cleaned)
    cleaned = re.sub(r" ", "", cleaned)  # finally, remove half-width spaces
    return cleaned
def main():
    """Run the preprocessing steps on a sample text and print the result."""
    text = u"@tarou1992 こんにちは!よろしくお願いします。iPhoneを修理に出してきた。新しいiphoneほしい。@hayasiy おやすみ!#bye イイネイイネいいね この動画面白い!https://www.youtube.com/watch?v=OEtDptT6diI バイト行ってきます。"
    text = clean_text(text)
    text = jaconv.h2z(text)  # half-width kana -> full-width kana
    text = jaconv.kata2hira(text)  # full-width katakana -> hiragana
    text = text.lower()  # uppercase letters -> lowercase
    # Parenthesized form works as a statement on Python 2 and as a function
    # call on Python 3.
    print(text)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment