Last active
November 1, 2022 05:58
-
-
Save gyakkun/fae171d09342af9b6f78a4d33c741ac9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Archives bgm.tv group topics as HTML files inside a git repository.
#
# Environment:
#   BGM_GIT_REPO  Path of the archive repository
#                 (default: $HOME/source/bgm-archive).

# Working copy that receives the downloaded topic pages.
G_GIT_REPO_DIR=${BGM_GIT_REPO:-$HOME/source/bgm-archive}
# Topic IDs listed here are skipped by the archiving loop.
G_GROUP_TOPIC_AVOID_LIST=()
# Directory the script was started from.
G_PWD=$(pwd)
# Shared "return value" slot written by the helper functions.
G_RET=""
# printf templates; %s is replaced by a host from BGM_DOMAIN_LIST.
BGM_RAUKEN_TOPICLIST_URL_TEMPLATE="https://%s/rakuen/topiclist?type=group"
BGM_RAUKEN_TOPICLIST_URL=
BGM_GROUP_TOPIC_URL_TEMPLATE="https://%s/m/topic/group"
BGM_GROUP_TOPIC_URL=
#BGM_DOMAIN_LIST=(bangumi.tv bgm.tv chii.in)
BGM_DOMAIN_LIST=(bgm.tv)
# NOTE(review): hard-coded proxy endpoint; it overrides any proxy already set
# in the caller's environment.
export http_proxy=192.168.3.15:1183
export https_proxy=192.168.3.15:1183
#######################################
# Downloads a URL into a file with curl (silent, following redirects,
# 10 s connect/total timeouts, up to 6 retries).
# Arguments:
#   $1 - URL to fetch
#   $2 - path of the output file
# Returns:
#   curl's exit status
#######################################
curlToFile() {
  # Both arguments are quoted so URLs/paths containing spaces or glob
  # characters reach curl as single words.
  curl --connect-timeout 10 \
    --max-time 10 \
    --retry 6 \
    --retry-delay 3 \
    --retry-max-time 40 \
    -s -L --output "$2" "$1"
}
#######################################
# Normalises a downloaded HTML file in place:
#   1. removes everything from "<script" to the last "</script>" on a line,
#   2. strips leading spaces/tabs from every line,
#   3. deletes lines that are empty afterwards.
# Arguments:
#   $1 - path of the file to rewrite (uses sed -i)
# Returns:
#   sed's exit status
#######################################
trimHtml() {
  # All three edits are line-local, so a single pass yields the same result
  # as three separate in-place rewrites.
  sed -i \
    -e 's|<script.*</script>||g' \
    -e 's|^[ \t]*||g' \
    -e '/^$/d' \
    -- "$1"
}
#######################################
# Picks a random entry of BGM_DOMAIN_LIST.
# Globals:
#   BGM_DOMAIN_LIST (read), G_RET (written with the chosen host)
# Returns:
#   0 on success; 1 (with G_RET emptied) when the list is empty
#######################################
randBgmDomain() {
  local count=${#BGM_DOMAIN_LIST[@]}
  # Guard against "division by 0" in the modulo below.
  if (( count == 0 )); then
    G_RET=""
    return 1
  fi
  # Array subscripts are evaluated arithmetically; replaces the obsolete
  # $[ ... ] syntax.
  G_RET=${BGM_DOMAIN_LIST[RANDOM % count]}
}
#######################################
# Stores the current Unix time in milliseconds in G_RET.
# Globals:
#   G_RET (written)
#######################################
currentTimeMills() {
  local stamp
  stamp=$(date +%s%N)
  if [[ $stamp =~ ^[0-9]+$ ]]; then
    # seconds followed by nanoseconds -> milliseconds
    G_RET=$(( stamp / 1000000 ))
  else
    # This date implementation did not expand %N to digits; fall back to
    # second resolution instead of failing the arithmetic expansion.
    G_RET=$(( $(date +%s) * 1000 ))
  fi
}
#######################################
# Stores the current UTC time in G_RET, formatted as
# YYYY-MM-DDTHH:MM:SS.mmmZ.
# Globals:
#   G_RET (written)
#######################################
currentTimeISO() {
  # NOTE(review): %3N (millisecond field) is a GNU date extension - confirm
  # the target hosts use GNU coreutils.
  G_RET=$(date -u +'%Y-%m-%dT%H:%M:%S.%3NZ')
}
#######################################
# Tests whether a command name resolves to something runnable.
# Arguments:
#   $1 - command name
# Returns:
#   0 if `command -v` finds it, non-zero otherwise; prints nothing
#######################################
command_exists() {
  command -v "$1" >/dev/null 2>&1
}
# Make sure the archive repository exists and is up to date before writing
# into it.
if ! mkdir -p "$G_GIT_REPO_DIR/group"; then
  echo "cannot create $G_GIT_REPO_DIR/group" >&2
  exit 1
fi
# Abort if the repository cannot be entered; otherwise git init / git pull
# would run against whatever directory the script was started from.
if ! cd "$G_GIT_REPO_DIR"; then
  echo "cannot cd to $G_GIT_REPO_DIR" >&2
  exit 1
fi
pwd
git init
proxychains git pull
cd "$G_PWD" || exit 1
pwd
# Log the start time in both formats.
currentTimeMills
echo "$G_RET"
currentTimeISO
echo "$G_RET"

# Download the group topic list page from a randomly chosen mirror.
randBgmDomain
echo "random picked : $G_RET"
printf -v BGM_RAUKEN_TOPICLIST_URL "$BGM_RAUKEN_TOPICLIST_URL_TEMPLATE" "$G_RET"
#echo after format $BGM_RAUKEN_TOPICLIST_URL $G_GIT_REPO_DIR/tmp.html
curlToFile "$BGM_RAUKEN_TOPICLIST_URL" "$G_GIT_REPO_DIR/tmp.html"

# Extract the numeric topic IDs once and store them sorted and de-duplicated,
# one per line. The redirection truncates the file even when nothing matches.
grep -Po '(?<=href="/rakuen/topic/group/)[0-9]+' "$G_GIT_REPO_DIR/tmp.html" \
  | sort -u > "$G_GIT_REPO_DIR/group/topiclist.txt"

# Visit the topics in random order. topic_list stays a whitespace-separated
# string because the archiving loop word-splits it.
topic_list=$(shuf "$G_GIT_REPO_DIR/group/topiclist.txt")
# Print the IDs on one line, separated by single spaces.
echo "${topic_list//$'\n'/ }"
# Download every topic in topic_list to
#   $G_GIT_REPO_DIR/group/<id / 10000>/<id % 10000 / 100>/<id>.html
# (both directory names zero-padded to two digits), then normalise the HTML.
# shellcheck disable=SC2086 # word-splitting intended: digit-only IDs
for i in $topic_list; do
  echo "archiving group topic $i"
  # Literal " id " substring match against the space-joined avoid list.
  if [[ " ${G_GROUP_TOPIC_AVOID_LIST[*]} " == *" $i "* ]]; then
    echo "$i is in AVOID LIST"
    continue
  fi

  # 10# forces base 10 so an ID with a leading zero is not parsed as octal.
  printf -v ten_thousand '%02d' "$(( 10#$i / 10000 ))"
  printf -v hundred '%02d' "$(( 10#$i % 10000 / 100 ))"
  output_dir=$G_GIT_REPO_DIR/group/$ten_thousand/$hundred
  output_loc=$output_dir/$i.html
  mkdir -p "$output_dir"

  randBgmDomain
  printf -v BGM_GROUP_TOPIC_URL "$BGM_GROUP_TOPIC_URL_TEMPLATE" "$G_RET"
  echo "$BGM_GROUP_TOPIC_URL/$i to $output_loc"
  if curlToFile "$BGM_GROUP_TOPIC_URL/$i" "$output_loc"; then
    trimHtml "$output_loc"
    if command_exists tidy; then
      echo tidying
      tidy --drop-empty-elements no \
        --tidy-mark no \
        --wrap 0 \
        --sort-attributes alpha \
        --quiet yes \
        --show-warnings no \
        -o "$output_loc" "$output_loc"
    fi
  else
    # Do not post-process a file that was not (fully) downloaded.
    echo "download failed for topic $i, skipping" >&2
  fi
  # Throttle requests to the remote site.
  sleep 1
done
# Commit everything that was archived and push it.
# Abort if the repository cannot be entered so that rm / git add never run
# in an unrelated directory.
if ! cd "$G_GIT_REPO_DIR"; then
  echo "cannot cd to $G_GIT_REPO_DIR" >&2
  exit 1
fi

# Commit message: "GROUP TOPIC:  <ISO timestamp> | <epoch milliseconds>"
git_commit_msg="GROUP TOPIC: "
currentTimeISO
git_commit_msg+=" $G_RET "
currentTimeMills
git_commit_msg+="| $G_RET"

# The downloaded topic list page is a scratch file and must not be committed.
rm -rf -- tmp.html
# "--" keeps a file name starting with "-" from being parsed as an option.
git add -- *
git commit --allow-empty -m "$git_commit_msg"
# Only report success when the push actually went through.
if ! proxychains git push; then
  echo "git push failed" >&2
  exit 1
fi
cd "$G_PWD" || exit 1
echo "success"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment