Created
April 17, 2018 07:45
-
-
Save atuyosi/d9db3fd46bdd8d700ed7239fb5fcaadf to your computer and use it in GitHub Desktop.
陸自のイラク日報を某社のサイトからダウンロードするスクリプト
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3 | |
# encoding: utf-8
import requests | |
import shutil, os | |
import lxml.html | |
def get_pdf(session, url, filename):
    """Download ``url`` through ``session`` and save the body to ``filename``.

    The body is copied from the raw response stream straight to disk, so the
    whole file is never held in memory. A response whose status code is not
    200 is skipped silently and no file is created.

    Args:
        session: Object exposing ``get(url, stream=True)`` (a
            ``requests`` session in this script).
        url: Address of the file to download.
        filename: Path of the local file to write (opened in binary mode).

    Returns:
        None.
    """
    # Streaming-to-file approach taken from:
    # [Saving images with python3 requests - Qiita](https://qiita.com/pollenjp/items/0c39c35120cd60575647)
    r = session.get(url, stream=True)
    try:
        if r.status_code == 200:
            # Ask the raw stream to undo any content-encoding (e.g. gzip)
            # so the bytes written to disk are the real file contents.
            r.raw.decode_content = True
            with open(filename, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
    finally:
        # A streamed response keeps its connection checked out until it is
        # fully read or closed; close it explicitly on every path.
        r.close()
def parse_url_and_download(xpath):
    """Download the target of every link element matched by ``xpath``.

    The expression is evaluated against the module-level ``dom``. For each
    matched element its text is printed, its ``href`` attribute is used as
    the download URL, and the part of that URL after the last ``/`` is used
    as the local file name passed to ``get_pdf`` (together with the
    module-level ``session``).

    Args:
        xpath: XPath expression selecting the link elements.
    """
    for anchor in dom.xpath(xpath):
        print(anchor.text)
        href = anchor.attrib['href']
        # Everything after the final slash becomes the file name.
        get_pdf(session, href, href.rsplit('/', 1)[-1])
# --- Script body: fetch the article page, then download the PDFs linked
# --- from each of its three report sections into a per-section directory.
session = requests.session()
url = "https://www.asahi.com/articles/ASL4J669JL4JUEHF016.html"
ua_header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}

res = session.get(url, headers=ua_header)
res.raise_for_status()

dom = lxml.html.fromstring(res.content)

# Each entry: (index of the <div> holding the section heading,
#              index of the <div> holding the link table,
#              directory the section's files are saved into).
sections = (
    (1, 2, "イラク復興支援群"),
    (3, 4, "イラク復興業務支援"),
    (5, 6, "イラク後送業務隊"),
)

# Remember the starting directory as an absolute path so it can always be
# restored, instead of relying on a relative '../'.
base_dir = os.getcwd()

for heading_index, table_index, dirname in sections:
    node = dom.xpath('//*[@id="insert-mt-list"]/div[%d]/h2' % heading_index)
    print(node[0].text)

    # exist_ok lets the script be re-run without failing on directories
    # created by a previous run.
    os.makedirs(dirname, exist_ok=True)
    os.chdir(dirname)
    try:
        parse_url_and_download(
            '//*[@id="insert-mt-list"]/div[%d]/table/tbody/tr/td[1]/a' % table_index
        )
    finally:
        # Return to the starting directory even if a download raises.
        os.chdir(base_dir)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment