Skip to content

Instantly share code, notes, and snippets.

@atuyosi
Created April 17, 2018 07:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save atuyosi/d9db3fd46bdd8d700ed7239fb5fcaadf to your computer and use it in GitHub Desktop.
Save atuyosi/d9db3fd46bdd8d700ed7239fb5fcaadf to your computer and use it in GitHub Desktop.
陸自のイラク日報を某社のサイトからダウンロードするスクリプト
#! /usr/bin/env python3
# encoding: utf-8
import requests
import shutil, os
import lxml.html
def get_pdf(session, url, filename, timeout=30):
    """Download ``url`` through ``session`` and save the body to ``filename``.

    Adapted from the Qiita article "Saving images with python3 requests":
    https://qiita.com/pollenjp/items/0c39c35120cd60575647

    Args:
        session: Object exposing a requests-style
            ``get(url, stream=..., timeout=...)`` method.
        url: Address of the file to fetch.
        filename: Path the response body is written to (opened in ``'wb'``).
        timeout: Seconds handed to ``session.get`` so a stalled server
            cannot block the script forever.

    Returns:
        True when the server answered 200 and the file was written,
        False otherwise (in which case no file is created).
    """
    r = session.get(url, stream=True, timeout=timeout)
    try:
        if r.status_code != 200:
            return False
        # Ask the raw stream to undo any Content-Encoding (gzip/deflate) so
        # the bytes on disk are the file itself, not its compressed form.
        r.raw.decode_content = True
        with open(filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
        return True
    finally:
        # A streamed response keeps its connection checked out until closed.
        r.close()
def parse_url_and_download(xpath):
    """Download every file linked from the elements matched by ``xpath``.

    Reads the module-level ``dom`` (the parsed page) and ``session``; each
    file is saved in the current working directory under the last
    ``/``-separated segment of its URL. The text of every matched element
    is printed as a progress indicator.

    Args:
        xpath: XPath expression evaluated against ``dom``; it is expected
            to select ``<a>`` elements carrying an ``href`` attribute.
    """
    for n in dom.xpath(xpath):
        print(n.text)
        download_url = n.get('href')
        if not download_url:
            # An anchor without a target cannot be fetched; skip it instead
            # of aborting the remaining downloads with a KeyError.
            continue
        filename = download_url.split('/')[-1]
        if not filename:
            # URL ends with '/', so there is no file name to save under.
            continue
        get_pdf(session, download_url, filename)
# Fetch the article page once; the same session is reused for every PDF.
session = requests.session()
url = "https://www.asahi.com/articles/ASL4J669JL4JUEHF016.html"
ua_header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
res = session.get(url, headers=ua_header, timeout=30)
res.raise_for_status()
dom = lxml.html.fromstring(res.content)

# One entry per section of the page:
# (XPath of the section heading, output directory, XPath of the PDF links).
sections = [
    ('//*[@id="insert-mt-list"]/div[1]/h2',
     "イラク復興支援群",
     '//*[@id="insert-mt-list"]/div[2]/table/tbody/tr/td[1]/a'),
    ('//*[@id="insert-mt-list"]/div[3]/h2',
     "イラク復興業務支援",
     '//*[@id="insert-mt-list"]/div[4]/table/tbody/tr/td[1]/a'),
    ('//*[@id="insert-mt-list"]/div[5]/h2',
     "イラク後送業務隊",
     '//*[@id="insert-mt-list"]/div[6]/table/tbody/tr/td[1]/a'),
]

# parse_url_and_download() saves into the current working directory, so each
# section is processed from inside its own directory.
base_dir = os.getcwd()
for heading_xpath, dirname, links_xpath in sections:
    node = dom.xpath(heading_xpath)
    print(node[0].text)
    # exist_ok lets the script be re-run after a partial download.
    os.makedirs(dirname, exist_ok=True)
    os.chdir(dirname)
    try:
        parse_url_and_download(links_xpath)
    finally:
        # Always return to the starting directory, even if a download fails,
        # so the next section is not created inside this one.
        os.chdir(base_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment