Skip to content

Instantly share code, notes, and snippets.

@hyoiutu
Last active December 11, 2017 15:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hyoiutu/504bc9f379e169c490fc80dfc671d6db to your computer and use it in GitHub Desktop.
Save hyoiutu/504bc9f379e169c490fc80dfc671d6db to your computer and use it in GitHub Desktop.
Tweetの前処理
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import MeCab
from datetime import datetime
from joblib import Parallel, delayed
from time import time
from more_itertools import pairwise
def tokenizer_jp(sentence):
    """Tokenize *sentence* with MeCab and return the tagger's output.

    The tagger is created with the ``-Owakati`` output format and the
    dictionary directory ``xxx/mylex``; the value produced by its ``parse``
    call is returned unchanged.
    """
    # NOTE(review): a fresh Tagger is constructed on every call. Presumably
    # this reloads the dictionary each time -- confirm before reusing one.
    wakati_tagger = MeCab.Tagger("-Owakati -d xxx/mylex")
    tokenized = wakati_tagger.parse(sentence)
    return tokenized
# "@" followed by one or more ASCII letters, digits or underscores.
_MENTION_RE = re.compile(r"@[A-Za-z0-9_]+")

# "http://" or "https://" followed by the ASCII characters RFC 3986 permits
# in a URI. Stopping at the first other character (whitespace, Japanese
# text, ...) keeps the rest of the line instead of discarding it.
_URL_RE = re.compile(r"https?://[A-Za-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]*")


def creaning(sentence):
    """Remove @mentions and http(s) URLs from *sentence*.

    Mentions are stripped first, then URLs. Only the matched text is
    removed; surrounding whitespace, any text following a URL, and a
    trailing newline are left in place.

    Args:
        sentence: The raw text to clean.

    Returns:
        The text with every mention and URL replaced by an empty string.
    """
    without_mentions = _MENTION_RE.sub("", sentence)
    return _URL_RE.sub("", without_mentions)
def process(line_in, line_out, pattern):
    """Clean and tokenize a pair of lines, rejecting pairs with a blank side.

    Both lines go through ``creaning`` and then ``tokenizer_jp``. If
    *pattern* matches either tokenized line, an empty tuple is returned;
    otherwise the result is ``(tokenized_in, tokenized_out)``.
    """
    tokenized_in = tokenizer_jp(creaning(line_in))
    tokenized_out = tokenizer_jp(creaning(line_out))

    # Guard clause: one rejected side is enough to drop the whole pair.
    if re.match(pattern, tokenized_in) or re.match(pattern, tokenized_out):
        return ()
    return (tokenized_in, tokenized_out)
# Matches a line that is empty or contains only whitespace.
pattern = re.compile(r"^\s*$")


def main():
    """Run the preprocessing pipeline over ``tweets.txt``.

    Lines are read two at a time from ``tweets.txt``; each pair is passed to
    ``process`` in parallel. For every pair that ``process`` keeps, the first
    element is written to ``input.txt`` and the second to ``output.txt``.
    The elapsed wall-clock time is printed at the end.
    """
    start = time()
    with open("tweets.txt", "r", encoding="utf-8") as f, \
            open("input.txt", "w", encoding="utf-8") as f_in, \
            open("output.txt", "w", encoding="utf-8") as f_out:
        # zip(f, f) draws both elements from the same file iterator, so it
        # yields (line 1, line 2), (line 3, line 4), ...; an unpaired final
        # line is dropped.
        jobs = [
            delayed(process)(line_in, line_out, pattern)
            for line_in, line_out in zip(f, f)
        ]
        result = Parallel(n_jobs=-1, verbose=7)(jobs)
        for pair in result:
            # process() returns an empty tuple for rejected pairs.
            if pair:
                f_in.write(pair[0])
                f_out.write(pair[1])
    print(f"result={time() - start}")


# Guard the entry point so importing this module does not start the
# pipeline; joblib's documentation asks for this guard when process-based
# workers are started on platforms that spawn a fresh interpreter.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment