Last active
September 21, 2021 14:03
-
-
Save geojackass/43ad76f4579442415ef2e692bb3e9258 to your computer and use it in GitHub Desktop.
全中水泳2020
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tabula
import pandas as pd

# Extract the tables from every page of one PDF. The result is indexed below
# as a sequence holding one DataFrame per extracted table.
df = tabula.read_pdf("dir/to/path", pages='all')

# Mapping from the headers found in the PDF (Japanese labels plus the
# "Unnamed: N" placeholders) to short ASCII column names.
header_map = {
    "水路": "suiro",
    "加 盟": "kamei",
    "Unnamed: 0": "num",
    "氏 名": "name",
    "Unnamed: 1": "kana",
    "所属名": "school",
    "Unnamed: 2": "school_kana",
    "学年": "grade",
}
# Columns that are kept for the final result.
wanted = ["suiro", "kamei", "name", "school", "grade"]

# When a PDF spans several pages it may come back as several tables; handle
# each one by incrementing the index into `df`.
renamed_first = df[0].rename(columns=header_map)
complete_first = renamed_first.dropna(how='any')  # drop rows with any missing cell
df0 = complete_first.loc[:, wanted]
df0.head()

renamed_second = df[1].rename(columns=header_map)
complete_second = renamed_second.dropna(how='any')
df1 = complete_second.loc[:, wanted]
df1.head()

# Join the per-table results back into a single table for the event.
df = pd.concat([df0, df1])
df.head()
# Collect the start-list files found in the data directory.
# The number embedded in each file name is the event number, not a running
# sequence, so the files do not need to be sorted by number before export.
# NOTE: some PDFs (event 004 in this data set) are split into several tables
# when scraped, which is why the loop below merges every table of a file.
import os
import re

path = "dir/to/path"
files = os.listdir(path)
print(files)

# Mapping from the headers found in the PDFs (Japanese labels plus the
# "Unnamed: N" placeholders) to short ASCII column names.
column_names = {
    "水路": "suiro",
    "加 盟": "kamei",
    "Unnamed: 0": "num",
    "氏 名": "name",
    "Unnamed: 1": "kana",
    "所属名": "school",
    "Unnamed: 2": "school_kana",
    "学年": "grade",
}
# Columns written to the exported CSV.
kept_columns = ["suiro", "kamei", "name", "school", "grade"]

# NOTE(review): these placeholder directories differ from `path` above; the
# input directory presumably has to be the one that was listed - confirm.
input_dir = "path/to/dir"
output_dir = "path/to/dir"

# Process every PDF automatically.
for v in files:
    # Skip anything that is not a PDF (hidden files, CSVs left by an earlier
    # run when input and output directories coincide, ...).
    if not v.endswith(".pdf"):
        continue

    # os.path.join inserts the separator that plain string concatenation
    # would omit, and tolerates a directory that already ends with one.
    fname = os.path.join(input_dir, v)

    # Name the output after the event number: the text between "S" and the
    # ".pdf" suffix. The dot is escaped so only a literal ".pdf" matches.
    outfiles = re.split(r"S|\.pdf", v)[1] + ".csv"
    outdir = os.path.join(output_dir, outfiles)

    tables = tabula.read_pdf(fname, pages='all')

    # Clean each extracted table the same way, so that files split into
    # several tables are merged instead of losing everything after the first.
    cleaned = []
    for position, table in enumerate(tables):
        table = table.rename(columns=column_names).dropna(how='any')
        missing = [col for col in kept_columns if col not in table.columns]
        if missing:
            print(v, "table", position, "skipped; missing columns:", missing)
            continue
        cleaned.append(table.loc[:, kept_columns])

    if not cleaned:
        print(v, "skipped; no table with the expected columns")
        continue

    merged = pd.concat(cleaned)
    print(merged)
    merged.to_csv(outdir, index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment