Skip to content

Instantly share code, notes, and snippets.

@PeterDing
Last active December 27, 2023 17:45
Show Gist options
  • Star 10 You must be signed in to star a gist
  • Fork 7 You must be signed in to fork a gist
  • Save PeterDing/186332d7716fab7a64e2 to your computer and use it in GitHub Desktop.
Save PeterDing/186332d7716fab7a64e2 to your computer and use it in GitHub Desktop.
#!/usr/bin/env bash
# This script relies on bash-only features ([[ ]], arrays, ${var:offset},
# ${var,,}, ${var//a/b}), so it has to run under bash, not a POSIX sh.
#iprange=(203.208.46.{130..220})
# Line counter for go_agent_list; it is only handed to set_proxy from code
# that is currently commented out, and init() overwrites it.
n=1
# Cookie string sent with every direct Google request (curl -b "$cookie").
# NOTE(review): this is a hard-coded session credential committed to the
# source; consider loading it from the environment or a private file instead.
cookie="NID=67=ETJk9gHoHDXCITmIERCi_4pQee7RP15r21gmpJOtIdPa8t-ACkpYtTRDP5PLWSRbzTCDtc1G4mRlJlw3JTJquEWGQiP6BuYsrCTXr6Gy5ksZSkyL0V6ksaoQkKNhd8_5; GDSESS=ID=85624701b31e9e70:TM=1373093615:C=c:IP=116.54.58.190-:S=APGng0taF82U15BVoKTeDL68KRytKdQS4w; PREF=ID=b73cef8edd4ba5cd:U=eb68016d7d87d427:FF=0:LD=en:NW=1:CR=2:TM=1373093611:LM=1373093619:S=mz5KrkjrCVMTZObz"
#####################
# functions
#####################
#######################################
# Query Google (through the fixed front-end 203.208.46.200) for the DMM
# product page of a title and print the first Google-cache URL that matches.
# Earlier variants, disabled in the original, went through the local proxy
# 127.0.0.1:1998 or a randomly chosen 203.208.46.x host instead.
# Globals:   cookie (read) - cookie string passed to curl -b
# Arguments: $1 - search terms for the title (the caller joins words with '+')
#            $2 - product id token; appended to the query and used verbatim
#                 as the tail of the grep pattern
# Outputs:   first matching webcache URL on stdout, or an empty line
#######################################
geturl () {
  local ua='Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.45 Safari/537.36'
  # Built and passed as ONE quoted word: unquoted, the URL would be split on
  # whitespace in $1 and its '?' would be open to pathname expansion.
  local query_url="http://203.208.46.200/search?q=${1}+${2}+detail+cid+site:www.dmm.co.jp/"
  local cache_url

  cache_url=$(
    curl -s -b "$cookie" --retry 3 --connect-timeout 3 -A "$ua" "$query_url" \
      | grep -o -P "http://webcache.googleusercontent.com/[^<>']+?mono/dvd[^<>']+?detail[^<>']+?cid[^<>']+?$2" \
      | head -n 1
  )
  printf '%s\n' "$cache_url"
}
# Download a URL through the local HTTP proxy at 127.0.0.1:1998 and write the
# response body to stdout.
# Arguments: $1 - URL to fetch
url_opener () {
  local -a curl_args=(
    -s
    -x "127.0.0.1:1998"
    --retry 3
    --connect-timeout 3
    -A "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.45 Safari/537.36"
    "$1"
  )
  curl "${curl_args[@]}"
}
# Print the first small-cover image URL (…ps.jpg on pics.dmm.co.jp) found in
# a page.
# Arguments: $1 - HTML text of the page
# Outputs:   the URL on stdout; nothing when the page has no such image
get_small_cover () {
  # "$1" is quoted so the HTML is neither word-split nor glob-expanded, and
  # the dots are escaped so that only a literal "ps.jpg" suffix is accepted.
  printf '%s\n' "$1" \
    | grep -m 1 -oP "http://pics\.dmm\.co\.jp/[^,<>']+?ps\.jpg" \
    | head -n 1
}
# Print the first large-cover image URL (…pl.jpg on pics.dmm.co.jp) found in
# a page.
# Arguments: $1 - HTML text of the page
# Outputs:   the URL on stdout; nothing when the page has no such image
get_lage_cover () {
  # "$1" is quoted so the HTML is neither word-split nor glob-expanded, and
  # the dots are escaped so that only a literal "pl.jpg" suffix is accepted.
  printf '%s\n' "$1" \
    | grep -m 1 -oP "http://pics\.dmm\.co\.jp/[^,<>']+?pl\.jpg" \
    | head -n 1
}
# Collect every screenshot URL (pics.dmm.co.jp/digital/video/….jpg) in a page
# and print them as a JSON-style array of strings.
# Arguments: $1 - HTML text of the page
# Outputs:   e.g. ["url1", "url2"] on stdout; [""] when nothing matches
#            (same as the original output for an empty result)
get_scrots () {
  local matches url
  local sep='", "' joined=''
  local -a urls=()

  # Dots are escaped so the match really ends at a literal ".jpg".
  matches=$(printf '%s\n' "$1" \
    | grep -oP 'http://pics\.dmm\.co\.jp/digital/video/.+?\.jpg')

  # Split on whitespace like the original unquoted echo did, but via read so
  # that no pathname expansion can rewrite the result.
  read -r -d '' -a urls <<< "$matches" || true
  for url in "${urls[@]}"; do
    joined+="${joined:+$sep}${url}"
  done
  printf '["%s"]\n' "$joined"
}
# Extract the review rating from a DMM page. The rating is encoded in the
# file name of a review image, e.g. …/p/ms/review/4_5.gif -> 4.5.
# Only lines 400-800 of the page are searched.
# Arguments: $1 - HTML text of the page
# Outputs:   the rating on stdout, or an empty line when no image is found
get_ratings () {
  local prefix='http://p.dmm.co.jp/p/ms/review/'
  local rating

  # Keep only the first match: with several review images the original
  # stripped the prefix from the first one and emitted the rest as raw URLs.
  rating=$(printf '%s\n' "$1" \
    | sed -n '400,800p' \
    | grep -o -P 'http://p\.dmm\.co\.jp/p/ms/review/.+?\.gif' \
    | head -n 1)

  rating=${rating#"$prefix"}   # drop the URL prefix
  rating=${rating//_/.}        # 4_5 -> 4.5
  rating=${rating%.gif}        # drop the extension
  printf '%s\n' "$rating"
}
# Derive the product id from the path of a saved page: drop the first 39
# characters (a fixed-length directory prefix) and the final "/component".
# Arguments: $1 - file path
# Outputs:   the id on stdout
get_id () {
  local path=${1:39}
  # Quoted so the id is printed exactly (no word splitting or globbing).
  printf '%s\n' "${path%/*}"
}
# Turn a product id into its search token: remove the first '-' and
# lower-case the result (ABC-123 -> abc123).
# Arguments: $1 - product id
# Outputs:   the token on stdout
get_idt () {
  local token=${1/-/}
  # Quoted so the token is printed exactly (no word splitting or globbing).
  printf '%s\n' "${token,,}"
}
# Print the group part of a product id: everything before the last '-'
# (ABC-123 -> ABC). An id without '-' is printed unchanged.
# Arguments: $1 - product id
# Outputs:   the group on stdout
get_group () {
  local id=$1
  # Quoted so the group is printed exactly (no word splitting or globbing).
  printf '%s\n' "${id%-*}"
}
# Run a multi-line PCRE against a file and print the matched text on a single
# line, with every run of whitespace collapsed to one space.
# Arguments: $1 - PCRE pattern (passed to pcregrep -o -M)
#            $2 - file to search
# Outputs:   the flattened match on stdout, or an empty line
get_title () {
  local raw
  local IFS=$' \t\n'
  local -a words=()

  # "$2" is quoted so a path containing spaces stays one operand.
  raw=$(pcregrep -o -M "$1" "$2")

  # Callers embed this text in a one-line record, so the whitespace
  # flattening of the original unquoted echo is kept; read -a performs the
  # same splitting without glob-expanding characters such as * ? [ ].
  read -r -d '' -a words <<< "$raw" || true
  printf '%s\n' "${words[*]}"
}
# Collect the performer names from a saved page and print them as a
# JSON-style array of strings. Names are taken from attributes of the form
# title="女优 <name>" (the label is part of the page markup).
# Arguments: $1 - path of the saved page
# Outputs:   e.g. ["name1", "name2"] on stdout; [""] when nothing matches
#            (same as the original output for an empty result)
get_artists () {
  local matches name
  local sep='", "' joined=''
  local -a names=()

  # "$1" is quoted so a path containing spaces stays one operand.
  matches=$(grep -oP 'title="女优 \K.+?(?=")' "$1")

  # Split on whitespace like the original unquoted echo did, but via read so
  # that no pathname expansion can rewrite the result.
  read -r -d '' -a names <<< "$matches" || true
  for name in "${names[@]}"; do
    joined+="${joined:+$sep}${name}"
  done
  printf '["%s"]\n' "$joined"
}
# Collect the genre names from a DMM page and print them as a JSON-style
# array of strings. Genres are read from the first line that contains
# '</a>&nbsp;<a', taking the text of every link on that line.
# Arguments: $1 - HTML text of the page
# Outputs:   e.g. ["genre1", "genre2"] on stdout; [""] when nothing matches
#            (same as the original output for an empty result)
get_genre () {
  local line matches genre
  local sep='", "' joined=''
  local -a genres=()

  line=$(printf '%s\n' "$1" | grep -m 1 '</a>&nbsp;<a')
  matches=$(printf '%s\n' "$line" | grep -oP '\/">\K.+?(?=<\/a>)')

  # Split on whitespace like the original unquoted echo did, but via read so
  # that no pathname expansion can rewrite the result.
  read -r -d '' -a genres <<< "$matches" || true
  for genre in "${genres[@]}"; do
    joined+="${joined:+$sep}${genre}"
  done
  printf '["%s"]\n' "$joined"
}
# Print the first space-separated field of the second-to-last line of a file
# (of the only line when the file has a single line). The script uses it to
# read the index of the last entry recorded in the log.
# Arguments: $1 - path of the log file
# Outputs:   the field on stdout, or an empty line
get_num () {
  local line
  line=$(tail -n 2 "$1" | head -n 1)
  # Quoted so the field is printed exactly (no word splitting or globbing).
  printf '%s\n' "${line%% *}"
}
# Point the west-chamber proxy at another GoAgent fetch host.
# Arguments: $1 - line number of the host to pick from ./go_agent_list
# Globals:   go_agent_url (written)
# Steps as written: run "wcproxy sp"; if that succeeds, read line $1 of
# go_agent_list into go_agent_url; run a sed "6c" script (replace line 6)
# that sets GOAGENT_FETCHHOST_LIST = "<url>"; finally run "wcproxy st".
# NOTE(review): "sp"/"st" presumably mean stop/start - confirm against wcproxy.
# The only call site visible in this file (inside main) is commented out.
set_proxy () {
# The trailing && ties only this command to the assignment on the next line;
# the sed call and the final wcproxy call run regardless of the outcome.
/tmp/t/liruqi-west-chamber/west-chamber-proxy/wcproxy sp &&
go_agent_url=`sed -n "$1"p go_agent_list`
# NOTE(review): this sed -i is given no file operand, so there is nothing for
# it to edit in place. Confirm which config file's line 6 should be rewritten
# and add it here before re-enabling set_proxy.
sed -i "6cGOAGENT_FETCHHOST_LIST = \"$go_agent_url\""
/tmp/t/liruqi-west-chamber/west-chamber-proxy/wcproxy st
}
# Roll the output files back to the last recorded entry before resuming.
# Arguments: $1 - index of the last recorded entry
#            $2 - log file: its last line is removed
#            $3 - record file: every line from ($1 + 1) onwards is removed
# Globals:   n (written) - left holding $1 + 1, as the original did
init () {
  n=$(( $1 + 1 ))
  sed -i '$d' "$2"
  sed -i "${n},\$d" "$3"
}
####################
#####################
# av info
####################
#group=`echo "$id" | grep -o '^.*\-' | sed 's/\-//'`
#######################################
# Process one index entry: read the metadata from the locally saved page,
# look up the title's Google-cache copy of the dmm.co.jp page, scrape rating,
# genre, covers and screenshots from it, then append one record to ./out and
# one progress line to ./log.finding.
# Globals:   cookie (read) - cookie string for the direct Google probe
# Arguments: $1 - path of the locally saved detail page
#            $2 - index of the entry; first field of the progress line
# Outputs:   the cache URL and the progress line on stdout
# Notes:     both wait loops poll forever until the response is at least 500
#            characters long, exactly as the original did.
#            NOTE(review): values are embedded in the record without JSON
#            escaping, so a '"' or '\' in a title yields an invalid record.
#            A proxy-rotation fallback (set_proxy) existed here in the
#            original but was commented out; it is not reproduced.
#######################################
main () {
  local -r ua='Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.45 Safari/537.36'
  local -r probe_url='http://203.208.46.200/search?q=google.com'
  local file id idt group title titlet runing_time publisher publication_date
  local description artists google_cache_dmm_url probe html
  local ratings genre cover_s cover_l scrots item
  local -a log_fields=()

  # --- metadata from the locally saved page -------------------------------
  file=$1
  id=$(get_id "$file")
  idt=$(get_idt "$id")
  group=$(get_group "$id")
  # The Chinese labels below are table headers in the saved page:
  # title, running time, publisher, publication date.
  title=$(get_title "作品名称</td>\n.+?<td>\K.+(?=</td>)" "$file")
  titlet=${title// /+}
  runing_time=$(get_title "片长</td>\n.+?<td>\K.+(?=</td>)" "$file")
  publisher=$(get_title "发行商</td>\n.+?<td>\K.+(?=</td>)" "$file")
  publication_date=$(get_title "出版日期</td>\n.+?<td>\K.+(?=</td>)" "$file")
  description=$(get_title "class=\"desc\">\K.+?(?=</td>)" "$file")
  artists=$(get_artists "$file")

  # --- locate the Google-cache copy of the DMM page ------------------------
  google_cache_dmm_url=$(geturl "$titlet" "$idt")
  printf '%s\n' "$google_cache_dmm_url"

  if [[ -z "$google_cache_dmm_url" ]]; then
    # No hit: wait until Google answers a trivial query with a full page
    # (>= 500 characters), then retry the search once.
    while :; do
      probe=$(curl -s -b "$cookie" --retry 3 --connect-timeout 3 -A "$ua" "$probe_url")
      (( ${#probe} >= 500 )) && break
      printf '%s\n' '--> no google_cache_dmm_url'
      sleep 2m
    done
    google_cache_dmm_url=$(geturl "$titlet" "$idt")
  fi

  # --- fetch the cached page -----------------------------------------------
  # Only fetch when there is a URL; the original also invoked curl with an
  # empty URL, which cannot return a page.
  html=''
  if [[ -n "$google_cache_dmm_url" ]]; then
    html=$(url_opener "$google_cache_dmm_url")
    while (( ${#html} < 500 )); do
      sleep 10s
      html=$(url_opener "$google_cache_dmm_url")
    done
  fi

  # --- scrape the cached page ----------------------------------------------
  ratings=$(get_ratings "$html")
  genre=$(get_genre "$html")
  cover_s=$(get_small_cover "$html")
  cover_l=$(get_lage_cover "$html")
  scrots=$(get_scrots "$html")

  item="\"$id\":[{\"group\":\"$group\"}, {\"ratings\":\"$ratings\"}, {\"artists\":$artists}, {\"title\":\"$title\"}, {\"genre\":$genre}, {\"runing_time\":\"$runing_time\"}, {\"publisher\":\"$publisher\"}, {\"publication_date\":\"$publication_date\"}, {\"description\":\"$description\"}, {\"cover_s\":\"$cover_s\"}, {\"cover_l\":\"$cover_l\"}, {\"scrots\":$scrots}, {\"google_cache_dmm_url\":\"$google_cache_dmm_url\"}],"

  # --- output ---------------------------------------------------------------
  # Progress line: non-empty fields joined by single spaces, as before, but
  # split with read so that no pathname expansion can occur.
  read -r -a log_fields <<< "$2 $ratings $id $cover_l"
  printf '%s\n' "${log_fields[*]}"
  printf '%s\n' "${log_fields[*]}" >> log.finding
  # Quoted: the record contains [ ] and may contain * or ?, which an unquoted
  # echo would hand to pathname expansion.
  printf '%s\n' "$item" >> out
}
##########################
# Resume point: index of the last entry recorded in log.finding (0 when the
# log is missing, empty or does not start with a number).
num=$(get_num "log.finding")
printf '%s\n' "$num"
[[ "$num" =~ ^[0-9]+$ ]] || num=0
# Drop the last log line and everything after entry $num in ./out, then
# continue with the next entry.
init "$num" "log.finding" "out"
num=$((num + 1))

# Load the index, splitting on whitespace like the original unquoted `cat`
# did, but through read so that no entry is glob-expanded.
files=()
read -r -d '' -a files < /home/peter/.nrop/fanhao/nindex || true
#files=(`cat "index"`)

# Keep the original upper bound, but never run past the end of the index
# (the original would call main with an empty path in that case).
limit=45474
if (( ${#files[@]} < limit )); then
  limit=${#files[@]}
fi

while (( num < limit )); do
  file=${files[num]}
  main "$file" "$num"
  num=$((num + 1))
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment