Last active
December 28, 2017 19:14
-
-
Save ki111/f84a859034b0bd84c576c7be4ca8a4bd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: UTF-8 | |
from bs4 import BeautifulSoup | |
import nltk | |
import time | |
import pandas as pd | |
# Imported here (directly beneath the file's import block) because the
# counting below relies on it.
from collections import Counter

# --- Configuration ---------------------------------------------------------
# Number of saved transcript pages; the script reads ./content/0.html through
# ./content/<page - 1>.html.
page = 2664
# Tokens that consist only of punctuation and must not be counted as words.
symbols = ["'", '"', '`', '.', ',', '-', '!', '?', ':', ';', '(', ')']

# Maps each lower-cased token to its total number of occurrences across all
# pages. A Counter preserves first-seen order, so the CSV rows come out in the
# order in which the words were first encountered.
word_counts = Counter()

# --- Count words in every page ---------------------------------------------
for i in range(page):
    path = './content/' + str(i) + '.html'
    # NOTE(review): the page is read with the platform default encoding, as in
    # the original script -- confirm the encoding of the saved pages and pass
    # encoding=... explicitly if it differs.
    with open(path) as page_file:
        content = page_file.read()
    print(path)

    # Put a space after every period so that "end.Next" is tokenized as two
    # words. (The original capped this at the first 1000 periods, which left
    # the remainder of long pages untouched.)
    content = content.replace(".", ". ")

    # Tokenize, drop pure-punctuation tokens, and count case-insensitively.
    word_counts.update(
        token.lower()
        for token in nltk.word_tokenize(content)
        if token.lower() not in symbols
    )

# Parallel lists kept under the original module-level names: word[k] occurred
# count[k] times.
word = list(word_counts)
count = list(word_counts.values())

# --- Write the results -----------------------------------------------------
# Each row is "<word>,<count>". The row index is no longer shared with the
# page loop variable, which previously made the write start at count[page - 1]
# and run off the end of the list.
with open('word.csv', 'w') as out_file:
    for w, occurrences in word_counts.items():
        # Commas inside a token (e.g. "1,000") would break the CSV columns,
        # so they are removed from the written word.
        out_file.write(w.replace(",", "") + ',' + str(occurrences) + '\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment