Last active
September 2, 2019 16:09
-
-
Save ctyo/fd803325eb6bf72d10d060a37724385d to your computer and use it in GitHub Desktop.
キャッシュレス消費者還元事業の一覧抽出
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Extract tab-separated records from the text dump of kameiten_touroku_list.pdf.

The input file is produced by running ``pdftotext kameiten_touroku_list.pdf``.
A record starts on a line of the form ``<number> <prefecture>...``; its
remaining fields are read from every second line after that start line.
One tab-separated row per record is written to standard output.
"""
import re
from typing import Iterator, Sequence

# Text file generated by `pdftotext kameiten_touroku_list.pdf`.
INPUT_PATH = "kameiten_touroku_list.txt"

PREFECTURES = (
    "北海道", "青森", "岩手", "宮城", "秋田", "山形", "福島", "茨城", "栃木",
    "群馬", "埼玉", "千葉", "東京", "神奈川", "新潟", "富山", "石川", "福井",
    "山梨", "長野", "岐阜", "静岡", "愛知", "三重", "滋賀", "京都", "大阪",
    "兵庫", "奈良", "和歌山", "鳥取", "島根", "岡山", "広島", "山口", "徳島",
    "香川", "愛媛", "高知", "福岡", "佐賀", "長崎", "熊本", "大分", "宮崎",
    "鹿児島", "沖縄",
)

# A record starts with a number (digits and commas), one whitespace character
# and a prefecture name.  The names must be a (non-capturing) group: inside
# square brackets they would form a character class that matches any single
# character of any name -- and a literal "|" -- rather than a whole name.
RECORD_START = re.compile(r'^[0-9,]+\s(?:' + '|'.join(PREFECTURES) + r')')

# The optional last field is only emitted when it starts with a percentage.
RETURN_RATE = re.compile(r'[0-9]+%')

# Offsets, relative to the record's first line, of the always-emitted fields.
FIELD_OFFSETS = (2, 4, 6, 8)
# Offset of the optional percentage field.
RETURN_RATE_OFFSET = 10


def _strip_newlines(text: str) -> str:
    """Return *text* with every CR and LF character removed."""
    return text.replace('\n', '').replace('\r', '')


def _line_at(lines: Sequence[str], index: int) -> str:
    """Return the cleaned line at *index*, or '' when it is past the end.

    A record that starts close to the end of the input may not have all of
    its follow-up lines; those fields are emitted empty instead of raising
    IndexError.
    """
    if index < len(lines):
        return _strip_newlines(lines[index])
    return ''


def format_record(lines: Sequence[str], index: int) -> str:
    """Build the tab-separated row for the record starting at ``lines[index]``.

    Spaces in the first line become tabs, so the number, the prefecture and
    whatever follows on that line end up in separate columns.  Every field,
    including the last one, is followed by a tab.
    """
    fields = [_strip_newlines(lines[index]).replace(' ', '\t')]
    fields.extend(_line_at(lines, index + offset) for offset in FIELD_OFFSETS)

    return_rate = _line_at(lines, index + RETURN_RATE_OFFSET)
    if RETURN_RATE.match(return_rate):
        fields.append(return_rate)

    return '\t'.join(fields) + '\t'


def extract_rows(lines: Sequence[str]) -> Iterator[str]:
    """Yield one formatted row for every line in *lines* that starts a record."""
    for index, line in enumerate(lines):
        if RECORD_START.match(line):
            yield format_record(lines, index)


def main() -> None:
    """Read INPUT_PATH and print one row per record to standard output."""
    # Undecodable bytes are dropped rather than aborting the whole run.
    with open(INPUT_PATH, 'r', encoding="utf8", errors='ignore') as f:
        lines = f.readlines()
    for row in extract_rows(lines):
        print(row)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
PDF自体はここから入手