-
-
Save kemsakurai/1c36b2f5caa4959d5d6260707518b2db to your computer and use it in GitHub Desktop.
Markdown 文書の統計量を計算するスクリプト
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
import os

import markdown
from bs4 import BeautifulSoup
from markdown import markdown
# Encoding passed to open() when reading the file list and each Markdown file.
CHARSET = 'utf-8'
# Path of the CSV file that receives the collected statistics.
MARKDOWN_STATS_FILE = "./check_results/markdown_stats.csv"
# Path of the text file that lists the Markdown files to analyse, one per line.
MARKDOWN_FILE_LIST = "./markdown_list.txt"
# Column titles of the statistics CSV, in output order.
_HEADER = (
    "ファイル名", "文字数", "句読点数", "h1の数", "h2の数", "h3の数", "h4の数",
    "h5の数", "h6の数", "tableの数", "liの数", "dtの数", "imgの数", "aの数",
)

# HTML elements counted per document, in the same order as the header columns.
_COUNTED_TAGS = (
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'table', 'li', 'dt', 'img', 'a',
)


def _collect_stats(path):
    """Build one CSV row of statistics for the Markdown file at *path*.

    The file is rendered to HTML with ``markdown`` and parsed with
    BeautifulSoup. The returned list holds, in header order: the base file
    name, the length of the rendered text, the number of "、" and "。"
    characters, and the number of h1-h6, table, li, dt, img and a elements.

    Raises:
        OSError: if *path* cannot be opened.
    """
    with open(path, encoding=CHARSET) as markdown_file:
        source = markdown_file.read()
    soup = BeautifulSoup(markdown(source), 'html.parser')

    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); both select the text nodes of the document.
    text = ''.join(soup.find_all(string=True))

    row = [
        os.path.basename(path),
        len(text),
        text.count("、") + text.count("。"),
    ]
    row.extend(len(soup.find_all(tag)) for tag in _COUNTED_TAGS)
    return row


def main():
    """Write statistics for every listed Markdown file to a CSV file.

    Reads file paths, one per line, from ``MARKDOWN_FILE_LIST``, computes a
    statistics row for each, and writes a header row followed by those rows
    to ``MARKDOWN_STATS_FILE``.

    Raises:
        OSError: if the list file, a listed Markdown file, or the output
            file cannot be opened.
    """
    results = [list(_HEADER)]

    with open(MARKDOWN_FILE_LIST, encoding=CHARSET) as f:
        for line in f:
            # Drop the line terminator (LF or CRLF) and ignore blank lines so
            # a trailing empty line does not become an attempt to open ''.
            path = line.rstrip('\r\n')
            if not path.strip():
                continue
            results.append(_collect_stats(path))

    # Create the output directory on first run instead of failing on open().
    out_dir = os.path.dirname(MARKDOWN_STATS_FILE)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # newline='' lets the csv module control line endings; an explicit
    # encoding keeps the Japanese header independent of the locale default.
    with open(MARKDOWN_STATS_FILE, "w", encoding=CHARSET, newline='') as results_csv:
        csv.writer(results_csv, delimiter=',').writerows(results)
# Run the statistics collection only when executed as a script, not on import.
if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ファイル名 | 文字数 | 句読点数 | h1の数 | h2の数 | h3の数 | h4の数 | h5の数 | h6の数 | tableの数 | liの数 | dtの数 | imgの数 | aの数
---|---|---|---|---|---|---|---|---|---|---|---|---|---
wicketとdropwizardを連携する.md | 2339 | 37 | 1 | 1 | 0 | 7 | 5 | 0 | 0 | 8 | 0 | 0 | 8
EclipseLinkのDDLにsemicolonを付与する.md | 1895 | 19 | 2 | 0 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2
Apache Wicket 6でRestAPIを使う.md | 6944 | 57 | 0 | 8 | 0 | 0 | 0 | 0 | 0 | 14 | 0 | 0 | 20
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment