-
-
Save PeterDing/186332d7716fab7a64e2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# This script uses bash-only features ([[ ]], arrays, ${var,,}, ${var:offset}),
# so it must be run by bash rather than a POSIX sh.

#iprange=(203.208.46.{130..220})

# Line number handed to set_proxy by the (currently commented-out)
# proxy-rotation code in main.
n=1

# Cookie header sent with every direct Google request (curl -b "$cookie").
# NOTE(review): this is a hard-coded session cookie; consider loading it from
# the environment or a private file instead of keeping it in source.
cookie="NID=67=ETJk9gHoHDXCITmIERCi_4pQee7RP15r21gmpJOtIdPa8t-ACkpYtTRDP5PLWSRbzTCDtc1G4mRlJlw3JTJquEWGQiP6BuYsrCTXr6Gy5ksZSkyL0V6ksaoQkKNhd8_5; GDSESS=ID=85624701b31e9e70:TM=1373093615:C=c:IP=116.54.58.190-:S=APGng0taF82U15BVoKTeDL68KRytKdQS4w; PREF=ID=b73cef8edd4ba5cd:U=eb68016d7d87d427:FF=0:LD=en:NW=1:CR=2:TM=1373093611:LM=1373093619:S=mz5KrkjrCVMTZObz"

#####################
# functions
#####################
#######################################
# Query the Google front-end at 203.208.46.200 for the DMM detail page of a
# title and print the first Google-cache URL found in the result page.
# Earlier experiments (a random 203.208.46.x host, or www.google.com through
# the local proxy 127.0.0.1:1998) were tried and are no longer used.
# Globals:   cookie (read) - sent with the request via curl -b
# Arguments: $1 - title, with spaces already replaced by '+'
#            $2 - product id; it is also appended verbatim to the PCRE below,
#                 so it must not contain regex metacharacters
# Outputs:   the first matching URL on stdout (an empty line when none)
#######################################
geturl () {
  local user_agent="Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.45 Safari/537.36"
  local found

  # The URL is quoted so that '?' is never treated as a glob and the title
  # can never be word-split into several curl arguments.
  found=$(
    curl -s -b "$cookie" --retry 3 --connect-timeout 3 -A "$user_agent" \
      "http://203.208.46.200/search?q=$1+$2+detail+cid+site:www.dmm.co.jp/" \
      | grep -o -P "http://webcache.googleusercontent.com/[^<>']+?mono/dvd[^<>']+?detail[^<>']+?cid[^<>']+?$2" \
      | head -n 1
  )
  printf '%s\n' "$found"
}
#######################################
# Download a URL through the local HTTP proxy at 127.0.0.1:1998.
# Arguments: $1 - URL to fetch
# Outputs:   the response body on stdout (curl -s, so no progress output)
# Returns:   curl's exit status
#######################################
url_opener () {
  local user_agent="Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.45 Safari/537.36"
  curl -s -x "127.0.0.1:1998" --retry 3 --connect-timeout 3 -A "$user_agent" "$1"
}
#######################################
# Extract the first small-cover image URL (…ps.jpg on pics.dmm.co.jp).
# Arguments: $1 - HTML text to search
# Outputs:   the first matching URL on stdout, nothing when there is none
#######################################
get_small_cover () {
  # The HTML is passed quoted so '*', '?' and '[' in the page are never
  # expanded as globs; dots in the pattern are escaped to match literally.
  printf '%s\n' "$1" \
    | grep -m 1 -oP "http://pics\.dmm\.co\.jp/[^,<>']+?ps\.jpg" \
    | head -n 1
}
#######################################
# Extract the first large-cover image URL (…pl.jpg on pics.dmm.co.jp).
# Arguments: $1 - HTML text to search
# Outputs:   the first matching URL on stdout, nothing when there is none
#######################################
get_lage_cover () {
  # The HTML is passed quoted so '*', '?' and '[' in the page are never
  # expanded as globs; dots in the pattern are escaped to match literally.
  printf '%s\n' "$1" \
    | grep -m 1 -oP "http://pics\.dmm\.co\.jp/[^,<>']+?pl\.jpg" \
    | head -n 1
}
#######################################
# Collect every screenshot URL under pics.dmm.co.jp/digital/video/ and print
# them as a JSON-style array of strings.
# Arguments: $1 - HTML text to search
# Outputs:   ["url1", "url2", ...] on stdout; [""] when nothing matches
#            (the empty-result form is kept as the original produced it)
#######################################
get_scrots () {
  local urls
  urls=$(printf '%s\n' "$1" \
    | grep -oP "http://pics\.dmm\.co\.jp/digital/video/.+?\.jpg")

  # One array element per match; no word splitting or globbing is involved.
  local -a items=()
  local line
  while IFS= read -r line; do
    if [[ -n $line ]]; then
      items+=("$line")
    fi
  done <<<"$urls"

  local joined="" sep='", "' item
  for item in "${items[@]}"; do
    joined+="${joined:+$sep}$item"
  done
  printf '["%s"]\n' "$joined"
}
#######################################
# Derive the rating from the review image name, e.g. ".../review/4_5.gif"
# becomes "4.5". Only lines 400-800 of the page are searched.
# Arguments: $1 - HTML text of the detail page
# Outputs:   the rating on stdout (an empty line when no review image found)
#######################################
get_ratings () {
  local prefix="http://p.dmm.co.jp/p/ms/review/"
  local hit

  # Only the first review image is used; with several matches the original
  # prefix/suffix stripping produced a garbled multi-URL string.
  hit=$(printf '%s\n' "$1" \
    | sed -n '400,800p' \
    | grep -o -P "http://p\.dmm\.co\.jp/p/ms/review/.+?\.gif" \
    | head -n 1)

  hit=${hit#"$prefix"}
  hit=${hit%.gif}
  printf '%s\n' "${hit//_/.}"
}
#######################################
# Derive the product id from an index-file path: drop the first 39
# characters, then drop the final "/component".
# NOTE(review): 39 is presumably the length of a fixed directory prefix of
# the paths listed in the index file - confirm before moving the data.
# Arguments: $1 - path of the saved page
# Outputs:   the id on stdout
#######################################
get_id () {
  local path=$1
  path=${path:39}
  printf '%s\n' "${path%/*}"
}
#######################################
# Build the compact search form of an id: remove the first '-' and
# lowercase the result (e.g. "ABC-123" -> "abc123").
# Arguments: $1 - product id
# Outputs:   the compact id on stdout
#######################################
get_idt () {
  local compact=${1/-/}
  printf '%s\n' "${compact,,}"
}
#######################################
# Print the group part of an id: everything before the last '-'
# (e.g. "ABC-123" -> "ABC"). An id without '-' is printed unchanged.
# Arguments: $1 - product id
# Outputs:   the group on stdout
#######################################
get_group () {
  printf '%s\n' "${1%-*}"
}
#######################################
# Run a multi-line PCRE against a file with pcregrep and print the matched
# text flattened onto one line (runs of whitespace become a single space).
# Arguments: $1 - PCRE pattern (may span lines; pcregrep -M is used)
#            $2 - file to search
# Outputs:   the flattened match text on stdout (an empty line if no match)
#######################################
get_title () {
  local raw
  raw=$(pcregrep -o -M "$1" "$2")

  # Split on whitespace with read -a rather than an unquoted expansion, so
  # the text is flattened without any pathname expansion of '*', '?', '['.
  local IFS=$' \t\n'
  local -a words=()
  read -r -d '' -a words <<<"$raw"
  printf '%s\n' "${words[*]}"
}
#######################################
# Collect the performer names found in title="女优 NAME" attributes of a
# saved page and print them as a JSON-style array of strings.
# Arguments: $1 - file to search
# Outputs:   ["name1", "name2", ...] on stdout; [""] when nothing matches
#######################################
get_artists () {
  local names
  names=$(grep -oP 'title="女优 \K.+?(?=")' "$1")

  # One array element per match, so a name containing a space stays a
  # single entry and nothing is subject to pathname expansion.
  local -a items=()
  local line
  while IFS= read -r line; do
    if [[ -n $line ]]; then
      items+=("$line")
    fi
  done <<<"$names"

  local joined="" sep='", "' item
  for item in "${items[@]}"; do
    joined+="${joined:+$sep}$item"
  done
  printf '["%s"]\n' "$joined"
}
#######################################
# Extract genre names from the first line of the page that contains
# '</a> <a' (a row of adjacent links) and print them as a JSON-style array.
# Arguments: $1 - HTML text of the detail page
# Outputs:   ["genre1", "genre2", ...] on stdout; [""] when nothing matches
#######################################
get_genre () {
  local row names
  row=$(printf '%s\n' "$1" | grep -m 1 '</a> <a')
  # Each link text is whatever sits between '/">' and the next '</a>'.
  names=$(printf '%s\n' "$row" | grep -oP '\/">\K.+?(?=<\/a>)')

  # One array element per match; no word splitting or globbing is involved.
  local -a items=()
  local line
  while IFS= read -r line; do
    if [[ -n $line ]]; then
      items+=("$line")
    fi
  done <<<"$names"

  local joined="" sep='", "' item
  for item in "${items[@]}"; do
    joined+="${joined:+$sep}$item"
  done
  printf '["%s"]\n' "$joined"
}
#######################################
# Print the first space-separated field of the second-to-last line of a
# file (for a one-line file, of that single line).
# Arguments: $1 - file to read
# Outputs:   the field on stdout; nothing when the file cannot be read
#######################################
get_num () {
  # A missing log (e.g. on the very first run) is not an error: print
  # nothing and let the caller fall back to its default.
  if [[ ! -r $1 ]]; then
    return 0
  fi

  local row
  row=$(tail -n 2 "$1" | head -n 1)
  printf '%s\n' "${row%% *}"
}
#######################################
# Switch the proxy to the fetch host stored on a given line of
# ./go_agent_list: run "wcproxy sp", rewrite the host setting, then run
# "wcproxy st" (presumably stop / start - confirm against wcproxy).
# Globals:   go_agent_url (written) - the selected line of go_agent_list;
#            it is only refreshed when "wcproxy sp" succeeds
# Arguments: $1 - line number within go_agent_list
# Returns:   the exit status of "wcproxy st"
#######################################
set_proxy () {
  local wcproxy=/tmp/t/liruqi-west-chamber/west-chamber-proxy/wcproxy

  "$wcproxy" sp &&
    go_agent_url=$(sed -n "${1}p" go_agent_list)

  # NOTE(review): this sed -i has no file operand, so it fails with
  # "no input files" and line 6 of the intended config file is never
  # rewritten. The target file is not visible in this script; add its
  # path here once confirmed.
  sed -i "6cGOAGENT_FETCHHOST_LIST = \"$go_agent_url\""

  "$wcproxy" st
}
#######################################
# Roll the two output files back to a known-good state before resuming:
# drop the last line of the log and keep only the first $1 lines of the
# result file.
# Arguments: $1 - number of lines to keep in the result file
#            $2 - log file whose last line is removed
#            $3 - result file truncated after line $1
#######################################
init () {
  # Local counter: the original reused (and overwrote) the global "n".
  local keep=$1
  local first_stale=$(( keep + 1 ))

  # Files that do not exist yet (first run) are simply left alone instead
  # of making sed -i fail.
  if [[ -f $2 ]]; then
    sed -i '$d' "$2"
  fi
  if [[ -f $3 ]]; then
    sed -i "${first_stale},\$d" "$3"
  fi
}
#################### | |
##################### | |
# av info | |
#################### | |
#group=`echo "$id" | grep -o '^.*\-' | sed 's/\-//'` | |
#######################################
# Private helper: fetch a trivial search page from the Google front-end so
# main can tell whether requests are currently being answered.
# Globals:   cookie (read)
# Outputs:   the response body on stdout
#######################################
_probe_google () {
  curl -s -b "$cookie" --retry 3 --connect-timeout 3 \
    -A "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.45 Safari/537.36" \
    "http://203.208.46.200/search?q=google.com"
}

#######################################
# Process one saved page: read its metadata, locate the Google-cache copy of
# the matching DMM page, scrape ratings/genre/covers/screenshots from it and
# append one record to ./out plus one progress line to ./log.finding.
# Globals:   cookie (read, through geturl and _probe_google)
# Arguments: $1 - path of the saved page
#            $2 - index of this page; written as the first field of the
#                 progress line, which get_num reads back when resuming
# Outputs:   the cache URL and the progress line on stdout
# Notes:     both wait loops are unbounded by design: the function keeps
#            sleeping and retrying until a response of at least 500
#            characters arrives.
#######################################
main () {
  local file=$1
  local id idt group title titlet runing_time publisher publication_date
  local description artists google_cache_dmm_url probe html=""
  local ratings genre cover_s cover_l scrots item

  id=$(get_id "$file")
  idt=$(get_idt "$id")
  group=$(get_group "$id")

  title=$(get_title "作品名称</td>\n.+?<td>\K.+(?=</td>)" "$file")
  titlet=${title// /+}
  runing_time=$(get_title "片长</td>\n.+?<td>\K.+(?=</td>)" "$file")
  publisher=$(get_title "发行商</td>\n.+?<td>\K.+(?=</td>)" "$file")
  publication_date=$(get_title "出版日期</td>\n.+?<td>\K.+(?=</td>)" "$file")
  description=$(get_title "class=\"desc\">\K.+?(?=</td>)" "$file")
  artists=$(get_artists "$file")

  google_cache_dmm_url=$(geturl "$titlet" "$idt")
  printf '%s\n' "$google_cache_dmm_url"

  if [[ -z $google_cache_dmm_url ]]; then
    # No hit: make sure the search host answers at all (a body shorter than
    # 500 characters is treated as "not answering"), waiting two minutes
    # between probes, then retry the search once.
    probe=$(_probe_google)
    while (( ${#probe} < 500 )); do
      printf '%s\n' "--> no google_cache_dmm_url"
      sleep 2m
      probe=$(_probe_google)
    done
    google_cache_dmm_url=$(geturl "$titlet" "$idt")
  fi

  # Only fetch when a URL is known; the original also invoked curl once with
  # an empty URL, which can only fail.
  if [[ -n $google_cache_dmm_url ]]; then
    html=$(url_opener "$google_cache_dmm_url")
    while (( ${#html} < 500 )); do
      sleep 10s
      html=$(url_opener "$google_cache_dmm_url")
    done
  fi

  ratings=$(get_ratings "$html")
  genre=$(get_genre "$html")
  cover_s=$(get_small_cover "$html")
  cover_l=$(get_lage_cover "$html")
  scrots=$(get_scrots "$html")

  # NOTE(review): scalar values are embedded without JSON escaping, so a
  # double quote or backslash in e.g. the title yields a malformed record.
  item="\"$id\":[{\"group\":\"$group\"}, {\"ratings\":\"$ratings\"}, {\"artists\":$artists}, {\"title\":\"$title\"}, {\"genre\":$genre}, {\"runing_time\":\"$runing_time\"}, {\"publisher\":\"$publisher\"}, {\"publication_date\":\"$publication_date\"}, {\"description\":\"$description\"}, {\"cover_s\":\"$cover_s\"}, {\"cover_l\":\"$cover_l\"}, {\"scrots\":$scrots}, {\"google_cache_dmm_url\":\"$google_cache_dmm_url\"}],"

  # Progress line: the non-empty fields joined by single spaces, exactly as
  # the former unquoted echo printed them.
  local -a log_fields=()
  local field
  for field in "$2" "$ratings" "$id" "$cover_l"; do
    if [[ -n $field ]]; then
      log_fields+=("$field")
    fi
  done
  local IFS=' '
  local log_line="${log_fields[*]}"

  printf '%s\n' "$log_line"
  printf '%s\n' "$log_line" >> log.finding
  # Written quoted: the record contains '[' and ']', which an unquoted echo
  # would expose to pathname expansion.
  printf '%s\n' "$item" >> out
}
##########################
# Driver: resume after the last fully recorded entry and process the
# remaining index entries one by one.

# Index recorded on the second-to-last progress line; empty on a first run.
num=$(get_num "log.finding")
printf '%s\n' "$num"
: "${num:=0}"

# Drop the possibly incomplete last progress line and keep $num records.
init "$num" "log.finding" "out"
num=$(( num + 1 ))

# Whitespace-separated list of page paths. read -a splits exactly like the
# former unquoted $(cat ...) but without pathname expansion.
files=()
IFS=$' \t\n' read -r -d '' -a files < /home/peter/.nrop/fanhao/nindex

# Stop at the historical cap of 45474, or earlier when the list is shorter,
# so main is never called with an empty path.
last=45474
if (( ${#files[@]} < last )); then
  last=${#files[@]}
fi

while (( num < last )); do
  file=${files[num]}
  main "$file" "$num"
  num=$(( num + 1 ))
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment