Created
November 9, 2015 15:58
-
-
Save bzz0217/fca9d06d47d734e27220 to your computer and use it in GitHub Desktop.
画像スクレイピング
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Image scraper for the minkch.com RSS feed.
#
# Walks the feed items in document order and, for every item published within
# the last 24 hours (up to ${max} items), downloads each JPEG hosted on
# imgs.minkch.com that is referenced from the item's <content:encoded> body
# into ${output_path}.
#
# Requires: xmllint (libxml2), wget, and a `date` that supports `-d`.

output_path="/var/www/html/m/"
nsurl="http://purl.org/rss/1.0/modules/content/"
rssurl="http://minkch.com/feed"

# 24 hours ago, as a UNIX timestamp.
yesterday=$(date -d '1 days ago' +'%s')

count=1
max=100

if [[ ! -d "${output_path}" ]]; then
  echo "output directory does not exist: ${output_path}" >&2
  exit 1
fi

# Download the feed once and query the local copy, instead of letting xmllint
# re-fetch the URL twice for every item.
feed_file=$(mktemp) || { echo "mktemp failed" >&2; exit 1; }
trap 'rm -f -- "${feed_file}"' EXIT
if ! wget -q -O "${feed_file}" "${rssurl}"; then
  echo "failed to download feed: ${rssurl}" >&2
  exit 1
fi

# Stop after ${max} items regardless of their dates.
while (( count <= max )); do
  # XPath of the item's publication date.
  pubDate_path="/rss/channel/item[${count}]/pubDate/text()"

  # Fetch the publication date; the second line of the xmllint shell output
  # carries the node text.
  created_date=$(echo "cat ${pubDate_path}" \
    | xmllint --shell "${feed_file}" \
    | sed -n 2p)
  echo "created_date:${created_date}"

  # An empty date means there is no item at this index: end of feed.
  if [[ -z "${created_date}" ]]; then
    break
  fi

  # Publication time as a UNIX timestamp. Bail out on an unparsable date
  # rather than comparing against an empty value.
  if ! pubDate=$(date -d "${created_date}" +'%s'); then
    echo "cannot parse publication date: ${created_date}" >&2
    break
  fi

  # Items older than 24 hours end the run.
  if (( pubDate < yesterday )); then
    break
  fi

  # XPath of the item's content (content:encoded, bound to prefix "e").
  content_path="/rss/channel/item[${count}]/e:encoded"
  content=$(printf 'setns e=%s\ncat %s\n' "${nsurl}" "${content_path}" \
    | xmllint --shell "${feed_file}")

  # Extract the src of every <img> served from imgs.minkch.com. The character
  # class excludes '"' so a match cannot run past the end of the src attribute.
  while IFS= read -r imgurl; do
    [[ -n "${imgurl}" ]] || continue

    # Build the image file name: yyyymmdd_xxxx.jpg
    filename=$(printf '%s\n' "${imgurl}" \
      | sed -e 's/.*imgs\/\(.*\)\/\(.*jpg\)/\1_\2/')
    # If the URL did not have the expected layout the result still contains
    # slashes; fall back to the last path component so wget gets a valid name.
    if [[ "${filename}" == */* ]]; then
      filename=${imgurl##*/}
    fi

    if ! wget -O "${output_path%/}/${filename}" "${imgurl}" </dev/null; then
      echo "failed to download image: ${imgurl}" >&2
    fi
  done < <(printf '%s\n' "${content}" \
    | grep -o '<img src="http://imgs\.minkch\.com/[^"<]*jpg"' \
    | sed -e 's/<img src="\([^"<]*jpg\)"/\1/')

  count=$(( count + 1 ))
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment