@nozma
Created June 23, 2018 01:54
# -*- coding:utf-8 -*-
import re
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Scrape the pesticide registration bulletin (農薬登録情報速報,
# http://www.acis.famic.go.jp/searchF/index/index.html).
# Note: follow FAMIC's disclaimer (https://www.acis.famic.go.jp/index_kensaku.htm).

# Fetch the bulletin index page
url = "http://www.acis.famic.go.jp/searchF/index/"
resp = requests.get(url)
soup = BeautifulSoup(resp.text, "html.parser")

for tag in soup.find_all("a"):
    link = tag.get("href")
    # Only follow links reachable in one hop (i.e. the relatively recent bulletins)
    if link is not None and re.search("/", link) is None:
        # Links are relative, so convert them to absolute URLs
        surl = urljoin(url, link)
        tables = pd.read_html(surl)
        # Extract the update date from the URL and use it as the file name
        fname = re.match(r"(\d+)", link).group(1) + ".csv"
        outputdir = "output/"
        if not os.path.isdir(outputdir):
            os.makedirs(outputdir)
        pd.concat(tables).to_csv(outputdir + fname)
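
For reference, a minimal sketch of reading one of the saved CSV files back into pandas. The file name "20180623.csv" is hypothetical; actual names come from the update date embedded in each link.

import pandas as pd

# Hypothetical file name; actual names are derived from each bulletin's update date.
df = pd.read_csv("output/20180623.csv", index_col=0)
print(df.head())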