Skip to content

Instantly share code, notes, and snippets.

@ki111
Last active December 28, 2017 19:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ki111/f84a859034b0bd84c576c7be4ca8a4bd to your computer and use it in GitHub Desktop.
Save ki111/f84a859034b0bd84c576c7be4ca8a4bd to your computer and use it in GitHub Desktop.
# coding: UTF-8
from bs4 import BeautifulSoup
import nltk
import time
import pandas as pd
# Settings
# Number of transcript files to read: ./content/0.html .. ./content/<page-1>.html
page = 2664
# Tokens that are pure punctuation and must not be counted as words.
symbols = ["'", '"', '`', '.', ',', '-', '!', '?', ':', ';', '(', ')']


def count_words(texts, tokenize=None):
    """Count case-insensitive word occurrences across several texts.

    Args:
        texts: iterable of strings (one per transcript file).
        tokenize: callable mapping a string to an iterable of tokens.
            Defaults to ``nltk.word_tokenize``.

    Returns:
        dict mapping each lower-cased token (punctuation tokens listed in
        ``symbols`` excluded) to its number of occurrences, in first-seen
        order.
    """
    if tokenize is None:
        tokenize = nltk.word_tokenize
    ignored = frozenset(symbols)
    counts = {}
    for text in texts:
        # Put a space after every period so that "end.Start" is split into
        # separate tokens. The original capped this at 1000 replacements,
        # which left the tail of long transcripts unsplit.
        spaced = text.replace(".", ". ")
        for token in tokenize(spaced):
            key = token.lower()
            if key in ignored:
                continue
            # A dict lookup replaces the linear scan over a parallel list.
            counts[key] = counts.get(key, 0) + 1
    return counts


def format_rows(counts):
    """Return one ``word,count`` CSV line (newline-terminated) per entry.

    Commas inside a word are removed because they would otherwise break
    the two-column CSV layout.
    """
    return [
        word.replace(",", "") + "," + str(total) + "\n"
        for word, total in counts.items()
    ]


def main():
    """Read every transcript, count its words and write ``word.csv``."""

    def transcripts():
        # Yield the contents of each downloaded transcript, closing every
        # file as soon as it has been read.
        for index in range(page):
            path = './content/' + str(index) + '.html'
            print(path)
            with open(path) as handle:
                yield handle.read()

    counts = count_words(transcripts())

    # Each word is written together with its own count. The original
    # indexed the count list with the stale file-loop index, so rows were
    # paired with the wrong counts and the loop ran off the end of the list.
    with open('word.csv', 'w') as out:
        out.writelines(format_rows(counts))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment