Skip to content

Instantly share code, notes, and snippets.

@konoha81
Last active June 27, 2017 11:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save konoha81/628fe147c48ff2925830 to your computer and use it in GitHub Desktop.
Save konoha81/628fe147c48ff2925830 to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl
use strict;
use LWP::Simple;
use Crypt::SSLeay;
use XML::Simple;
use Data::Dumper;
# ---- Interactive front end ------------------------------------------------
# Asks how many recent posts to scan, an optional tag and the ordering, lets
# nb() classify the fetched posts, then downloads the requested number of
# images into a freshly created, dated directory.
print("\n***** 最新のpostから指定枚数だけ遡り、スコア順に並び替えて画像を取得します。\
*****\n");
print("***** 取得したい枚数およびタグと、うち上位何件を取得したいをかを入力してくださ\
い。*****\n");
print("***** control+x → control+c でキャンセルが出来ます。*****\n\n");

# Number of posts to walk back through. The pattern is anchored at both ends
# so that input such as "1abc" is rejected instead of silently truncated.
print("取得したい枚数: ");
my $post = _read_answer();
if ($post !~ /\A[1-9][0-9]*\z/) {
    die "入力は半角の正の整数のみ可能です。\n";
}

# Optional tag; an empty answer means "no tag".
# NOTE(review): the tag is only used for the directory name below. It is not
# handed to post_fetch(), so no tag filtering takes place -- confirm whether
# post_fetch($post, $fetch_tags) was intended.
print("取得したいタグ\n(指定しない場合はEnter。タグは1つのみ):");
my $fetch_tags = _read_answer();

# The "top N" prompt below is disabled by wrapping it in POD (=puts ... =cut).
=puts
print("上位何件: ");
my $rank = <STDIN>;
chomp($rank);
if($rank<=$post){
}else{
die "取得したい枚数より少ない数を入力してください。\n";
}
=cut

# Ordering: "s" = by score, "p" = by preference (naive Bayes result).
print("score順?好み順? --- type [s] or [p]: ");
my $sorting = _read_answer();
if ($sorting !~ /\A[sp]\z/) {
    die "入力は s か p のみ可能です。\n";
}

# File-level state shared with the subroutines defined further down.
my %tags_hash;    # post id => tag string
my %url_hash;     # post id => image URL
my @post_id;      # ids of the fetched posts, in fetch order
my @id_tags;      # "id<TAB>tags" records handed to nb()
my %xml_data;     # post id => [score, url], used for score sorting
my %nb_score;     # post id => [like - dislike], filled by nb()

my ($post_tag, $post_url, $sorted_id) = post_fetch($post);

# Build one "id<TAB>tags" record per fetched post. Iterating over @post_id
# itself (instead of 0..$post) avoids producing empty records past its end.
for my $id (@post_id) {
    next unless defined $id;
    my $tags = $post_tag->{$id};
    $tags = '' unless defined $tags;
    push(@id_tags, $id . "\t" . $tags);
}

# nb() returns the liked ids in preference order and how many there are.
my ($nb_result_id, $like_sheets) = nb(@id_tags);
print("Likeに $like_sheets 枚分類されました。\n");

print("何枚取得しますか?:");
my $fetch_like = _read_answer();
# Validate the value that was just read (the old check re-tested $post).
if ($fetch_like !~ /\A[1-9][0-9]*\z/) {
    die "入力は半角の正の整数のみ可能です。\n";
}
elsif ($fetch_like > $like_sheets) {
    die "${like_sheets}枚より少ない枚数を入力してください。\n";
}

# Pick the id list that matches the requested ordering.
my @result_id = $sorting eq "s" ? @$sorted_id : @$nb_result_id;

# Date stamp (YYMMDD_) for the directory name, in Japan time.
$ENV{'TZ'} = "JST-9";
my ($mday, $month, $year) = (localtime)[3, 4, 5];
my $date = sprintf("%02d%02d%02d_", $year % 100, $month + 1, $mday);

# Directory name: <tag or "All">_<date>_<n>, using the first free n.
if ($fetch_tags eq '') {
    $fetch_tags = "All";
}
# Path separators inside a tag would otherwise point into a sub-directory
# that does not exist.
(my $safe_tag = $fetch_tags) =~ s{[/\\]}{_}g;
my $dir_base = "${safe_tag}_${date}";
my $dir_name;
for my $n (1 .. 999) {
    next if -e "$dir_base$n";
    $dir_name = "$dir_base$n";
    last;
}
defined $dir_name or die "No free directory name left for $dir_base\n";
mkdir($dir_name) or die "Cannot create directory $dir_name: $!\n";

# Download the first $fetch_like images and store each under its post id.
for my $k (0 .. $fetch_like - 1) {
    my $count = $k + 1;
    my $id    = $result_id[$k];
    my $url   = defined $id ? $post_url->{$id} : undef;
    unless (defined $url && $url ne '') {
        warn "No image URL for sheet No.$count; skipped.\n";
        next;
    }
    my $html = get($url);
    unless (defined $html) {
        warn "Download failed for $url; skipped.\n";
        next;
    }
    my $prefix = get_prefix($url);
    my $file   = "./$dir_name/$id$prefix";
    my $out;
    unless (open($out, '>', $file)) {
        warn "Cannot write $file: $!; skipped.\n";
        next;
    }
    binmode $out;
    print {$out} $html;
    close($out) or warn "Error while closing $file: $!\n";
    print "Acquisition of the No\.$count sheet has been completed.\n";
}

# Read one answer from STDIN with the trailing line ending (LF or CRLF)
# removed. Returns '' at end of input.
sub _read_answer {
    my $line = <STDIN>;
    $line = '' unless defined $line;
    $line =~ s/[\r\n]+\z//;
    return $line;
}
# Return the file extension of a URL, including the leading dot.
#
# Argument: the URL (or path) as a string.
# Returns:  e.g. ".jpg"; the empty string when the last path segment has no
#           extension or the argument is undefined.
#
# Only the final path segment is inspected. Query strings, fragments and the
# host name are ignored, so "http://example.com/a" yields '' rather than
# ".com/a", and "x.png?123" yields ".png".
sub get_prefix {
    my $url = shift;
    return '' unless defined $url;

    # Strip "?query" / "#fragment", then a leading "scheme://host".
    (my $path = $url) =~ s/[?#].*\z//s;
    $path =~ s{\A[A-Za-z][A-Za-z0-9+.\-]*://[^/]*}{};

    # The extension is the last ".xxx" with no further dot or slash after it.
    my ($ext) = $path =~ m{ ( \. [^./]+ ) \z }x;
    return defined $ext ? $ext : '';
}
# Fetch the newest posts page by page and record the ones whose tags match.
#
# Arguments:
#   $_[0]  number of posts to walk back through (a page holds 75 posts)
#   $_[1]  optional tag pattern; it is interpolated into a regex as-is, and
#          when it is omitted every post matches
#
# Side effects: fills the file-level %tags_hash, %url_hash, %xml_data and
# @post_id; sleeps one second and prints progress on every second page.
#
# Returns: (\%tags_hash, \%url_hash, \@ids_sorted_by_score)
# Dies when a page cannot be downloaded.
sub post_fetch {
    my ($wanted, $tag_pattern) = @_;
    $tag_pattern = '' unless defined $tag_pattern;

    my $per_page = 75;
    # Pages needed = ceil($wanted / $per_page).
    my $post_num = int(($wanted + $per_page - 1) / $per_page);

    for my $page (1 .. $post_num) {
        # Sleep for one second after every second page.
        if ($page % 2 == 0) {
            sleep 1;
            my $progress = int(($page * $per_page / $wanted) * 100);
            if ($progress <= 100) {
                print("wait...$progress\% \n");
            }
        }

        # NOTE(review): the base URL literal is empty in this copy of the
        # file; only the page number is appended -- confirm the endpoint.
        my $url = "" . "$page";
        my $res = get($url);
        defined $res
            or die "Failed to fetch post list page $page (URL: '$url')\n";

        # NOTE(review): keyattr is given a one-element hash; presumably meant
        # to keep <post> elements as a plain list -- confirm.
        my $xml  = XML::Simple->new(keyattr=>{'post'});
        my $data = $xml->XMLin($res);

        # A page may hold several posts (array ref), exactly one (hash ref)
        # or none at all; normalise to a list.
        my $found = ref($data) eq 'HASH' ? $data->{'post'} : undef;
        my @posts = ref($found) eq 'ARRAY' ? @$found
                  : ref($found) eq 'HASH'  ? ($found)
                  :                          ();

        # On the last page take only the remainder of $wanted / $per_page.
        my $rest = $per_page;
        if ($page == $post_num && $wanted % $per_page != 0) {
            $rest = $wanted % $per_page;
        }
        # Never read past the posts that were actually returned.
        $rest = scalar(@posts) if $rest > @posts;

        for my $j (0 .. $rest - 1) {
            my $entry = $posts[$j];
            next unless ref($entry) eq 'HASH' && defined $entry->{'id'};

            my $id    = $entry->{'id'};
            my $score = $entry->{'score'};
            my $tags  = defined $entry->{'tags'} ? $entry->{'tags'} : '';
            next unless $tags =~ /$tag_pattern|\n/;

            # Index tags and URL by post id; keep [score, url] for sorting.
            my $thum = $entry->{'sample_url'};
            $thum = '' unless defined $thum;
            $tags_hash{$id} = "$tags";
            $url_hash{$id}  = "$thum";
            $xml_data{$id}  = [$score, $thum];
            push(@post_id, $id);
        }
    }

    my @sort_id = score_sort(%xml_data);
    return (\%tags_hash, \%url_hash, \@sort_id);
}
# Naive Bayes Like/Dislike classifier over post tags.
#
# Arguments: a list of "id<TAB>tag tag tag ..." records.
#
# Reads Likeprob.txt and Dislikeprob.txt from the current directory; each
# line is "tag<TAB>probability". For every record the log prior of each class
# plus the log probability of every tag is summed; a tag that is missing from
# a table (or has a non-positive probability) counts as 0.001.
#
# Side effects: stores [like - dislike] in the file-level %nb_score for every
# record classified as Like.
#
# Returns: (\@ids_sorted_by_nb_score, $number_of_like_records)
# Dies when one of the probability files cannot be opened.
sub nb {
    my %like_hash    = _read_prob_file("Likeprob.txt");
    my %dislike_hash = _read_prob_file("Dislikeprob.txt");

    # Class sizes of the training data, used for the priors.
    # NOTE(review): the original comment spoke of 96 Dislike samples while
    # the code uses 140 -- confirm which figure is right.
    my $like_num    = 147;
    my $dislike_num = 140;
    my $total       = $like_num + $dislike_num;

    # Probability assumed for a tag that a table does not know.
    my $unseen_prob = 0.001;

    my $like_sheets = 0;
    for my $record (@_) {
        next unless defined $record;
        my ($id, $tag_field) = split(/\t/, $record);
        # Records without an id would otherwise be classified by the priors
        # alone and counted as Like.
        next unless defined $id && $id ne '';
        $tag_field = '' unless defined $tag_field;

        my $like    = log($like_num / $total);
        my $dislike = log($dislike_num / $total);

        # Every tag contributes, including the last one.
        for my $tag (split(/ /, $tag_field)) {
            my $p_like    = $like_hash{$tag};
            my $p_dislike = $dislike_hash{$tag};
            $like += log(
                (defined $p_like && $p_like > 0) ? $p_like : $unseen_prob);
            $dislike += log(
                (defined $p_dislike && $p_dislike > 0) ? $p_dislike
                                                       : $unseen_prob);
        }

        # Only Like results are recorded; Dislike and ties are dropped.
        if ($like > $dislike) {
            $nb_score{$id} = [$like - $dislike];
            $like_sheets++;
        }
    }

    my @nb_sort_id = score_sort(%nb_score);
    return (\@nb_sort_id, $like_sheets);
}

# Load a "tag<TAB>probability" file and return it as a tag => probability
# list. Dies with the file name when the file cannot be opened.
sub _read_prob_file {
    my ($path) = @_;
    open(my $fh, '<', $path) or die "$path: $!\n";
    my %prob_of;
    while (my $line = <$fh>) {
        $line =~ s/[\r\n]+\z//;
        my ($tag, $prob) = split(/\t/, $line);
        next unless defined $tag;
        $prob_of{$tag} = $prob;
    }
    close($fh);
    return %prob_of;
}
# Order ids by score, highest first.
#
# Arguments: a flat id => [score, ...] list (a hash passed by value).
# Returns:   the ids sorted by descending score; ids with equal scores are
#            ordered by ascending string comparison.
sub score_sort {
    my %entry_for = @_;
    my @ranked = sort {
        ($entry_for{$b}[0] <=> $entry_for{$a}[0]) or ($a cmp $b)
    } keys %entry_for;
    return @ranked;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment