Skip to content

Instantly share code, notes, and snippets.

@geojackass
Last active September 21, 2021 14:03
Show Gist options
  • Save geojackass/43ad76f4579442415ef2e692bb3e9258 to your computer and use it in GitHub Desktop.
Save geojackass/43ad76f4579442415ef2e692bb3e9258 to your computer and use it in GitHub Desktop.
全中水泳2020
import tabula
import pandas as pd
# Extracted header names -> ASCII column names used in the output.
# NOTE(review): the "Unnamed: N" keys look like columns that had no header
# text in the PDF -- confirm against an actual extraction.
column_names = {
    "水路": "suiro",
    "加 盟": "kamei",
    "Unnamed: 0": "num",
    "氏 名": "name",
    "Unnamed: 1": "kana",
    "所属名": "school",
    "Unnamed: 2": "school_kana",
    "学年": "grade",
}
# Columns kept in the final table, in output order.
kept_columns = ["suiro", "kamei", "name", "school", "grade"]

tables = tabula.read_pdf("dir/to/path", pages='all')
# A PDF that runs over several pages may come back as several tables, so
# clean every returned table instead of hard-coding indices 0 and 1.
cleaned = [
    table.rename(columns=column_names).dropna(how='any').loc[:, kept_columns]
    for table in tables
]
# Join the per-table results back into a single table for the event.
df = pd.concat(cleaned)
df.head()
# List the start-list PDFs found in the data directory.
# The number in each file name identifies the event, not a sequence position,
# so the files do not have to be sorted before the CSVs are written.
# NOTE: event 004 is extracted as more than one table, so every table of a
# file is merged below instead of keeping only the first one.
import os
import re

path = "dir/to/path"
files = os.listdir(path)
print(files)

# Extracted header names -> ASCII column names used in the output.
column_names = {
    "水路": "suiro",
    "加 盟": "kamei",
    "Unnamed: 0": "num",
    "氏 名": "name",
    "Unnamed: 1": "kana",
    "所属名": "school",
    "Unnamed: 2": "school_kana",
    "学年": "grade",
}
# Columns kept in each CSV, in output order.
kept_columns = ["suiro", "kamei", "name", "school", "grade"]
# Directory the CSV files are written to.
out_dir = "path/to/dir"
# Splits a file name around "S" and a literal ".pdf"; the dot is escaped so it
# no longer matches an arbitrary character. Compiled once, outside the loop.
# NOTE(review): presumably the files are named like "S<event>.pdf" -- confirm.
name_splitter = re.compile(r"S|\.pdf")

# Convert every start list in one pass.
for v in files:
    # Entries that are not PDFs cannot be parsed and carry no event number.
    if not v.endswith(".pdf"):
        continue
    # Read from the directory that was listed; os.path.join supplies the
    # separator that plain string concatenation left out.
    fname = os.path.join(path, v)
    # Name the CSV after the event number taken from the PDF file name.
    outfiles = name_splitter.split(v)[1] + ".csv"
    outdir = os.path.join(out_dir, outfiles)
    df = tabula.read_pdf(fname, pages='all')
    if not df:
        # Nothing was extracted; report it and move on to the next file.
        print("no tables found in " + fname)
        continue
    # Clean each extracted table the same way, then merge them so that a
    # start list split across tables still yields one complete CSV.
    df0 = pd.concat([
        table.rename(columns=column_names).dropna(how='any').loc[:, kept_columns]
        for table in df
    ])
    print(df0)
    df0.to_csv(outdir, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment