Created
June 23, 2018 01:54
-
-
Save nozma/bbc6b51b42150e034c19847b1be1b175 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding:utf-8 -*- | |
import re | |
import os | |
import requests | |
import pandas as pd | |
from bs4 import BeautifulSoup | |
from urllib.parse import urljoin | |
# Scrape the pesticide-registration bulletin index
# (http://www.acis.famic.go.jp/searchF/index/index.html) from FAMIC.
# NOTE: follow FAMIC's disclaimer (https://www.acis.famic.go.jp/index_kensaku.htm).


def _bulletin_filename(link):
    """Derive an output CSV filename from a bulletin link.

    Bulletin links start with the update date as leading digits
    (e.g. '20180623...html'); those digits become the CSV name.
    Returns None when the link has no leading digits.
    """
    m = re.match(r"(\d+)", link)
    return m.group(1) + ".csv" if m else None


# Fetch the index page that lists the bulletins.
url = "http://www.acis.famic.go.jp/searchF/index/"
resp = requests.get(url)
# Fail fast on HTTP errors instead of parsing an error page.
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

# Create the output directory once, up front (not once per link).
outputdir = "output/"
os.makedirs(outputdir, exist_ok=True)

for tag in soup.find_all("a"):
    link = tag.get("href")
    # Only follow same-directory links (the relatively recent bulletins);
    # links containing "/" point outside this index and are skipped.
    if link is None or "/" in link:
        continue
    fname = _bulletin_filename(link)
    if fname is None:
        # Link has no leading date digits; skip instead of crashing
        # on a None .group() call.
        continue
    # Links are relative, so resolve them against the index URL.
    surl = urljoin(url, link)
    # Pull every HTML table on the bulletin page and save them as one CSV.
    tables = pd.read_html(surl)
    pd.concat(tables).to_csv(outputdir + fname)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment