Skip to content

Instantly share code, notes, and snippets.

@jayleicn
Last active January 5, 2018 06:10
Show Gist options
  • Save jayleicn/bc20a6ebd04221410d855082e5c6e410 to your computer and use it in GitHub Desktop.
Save jayleicn/bc20a6ebd04221410d855082e5c6e410 to your computer and use it in GitHub Desktop.
start routine for pachong.rb
#!/bin/bash
# Launcher for pachong.rb.
# Splits the id range START..END into chunks of STEP ids and starts one
# `ruby pachong.rb <chunk_start> <STEP>` process per chunk, running at most
# N_THREADS of them at the same time.

# Directories the crawler sorts its results into.
mkdir -p download fail error

START=1
END=9500
STEP=500

# change N_THREADS to the number of CPU cores in your machine
N_THREADS=12

# seq prints one chunk start per line and xargs consumes them directly, so no
# scratch file is needed. -I already runs one command per input line; the
# former `-n 1` conflicted with it (GNU xargs treats the two options as
# mutually exclusive), so it is dropped.
seq "$START" "$STEP" "$END" |
  xargs -P "$N_THREADS" -I 'num' ruby pachong.rb num "$STEP"
# linux version

# Address prefix of a character page; the numeric character id is appended.
URL = 'bangumi.tv/character/'.freeze

# Ids that are skipped on this run: every entry found in download/, fail/ or
# error/ whose file name starts with digits contributes those digits as an id.
# (download/ and fail/ are filled by this script; nothing visible here writes
# to error/ -- presumably it is populated by hand or by another tool.)
# The result is a frozen, de-duplicated Array of Integers.
READY = %w[download fail error].flat_map do |dir|
  Dir.glob("#{dir}/*")
     .map { |path| path[%r{#{dir}/(\d+)}, 1] } # leading digits of the entry name, or nil
     .compact
     .map(&:to_i)
end.uniq.freeze
# Fetches the character page for id +i+ with wget, extracts the page title and
# the cover image link, and downloads the cover into the current directory.
#
# Side effects:
# * the page is saved as a file named after the id in the current directory;
# * when a cover link is found the image is fetched and the page file deleted;
# * when no cover link is found the page file is moved into fail/.
#
# i - character id (anything whose to_s is the id used in the URL).
#
# Returns the log text for this id as a String. The String is empty when the
# page could not be fetched, so callers can always append the result.
def download(i)
  fn = i.to_s
  # Argument-list form of system: no shell is involved, so nothing in the
  # address can be interpreted as shell syntax.
  system 'wget', "#{URL}#{fn}"
  # An empty String (not nil) keeps `log << download(i)` in the caller from
  # raising TypeError when wget produced no file.
  return '' unless File.exist?(fn)

  log = +''
  found = false
  File.readlines(fn).each do |line|
    if line =~ %r{<title>(.+)</title>}
      name, description = Regexp.last_match(1).split('|').map(&:strip)
      log << "#{i}: #{name}, #{description}\n"
    end
    # [^"]+ keeps the capture inside a single attribute value even when the
    # same line holds an earlier href.
    next unless line =~ /href="([^"]+)" class="cover thickbox"/

    # The link is protocol-relative; add the scheme and drop any query string.
    url = "http:#{Regexp.last_match(1)}".sub(/\?.+$/, '')
    log << url << "\n"
    system 'wget', url
    File.delete(fn)
    found = true
    break
  end
  unless found
    # No cover on this page: keep the page in fail/ so the id is skipped on
    # later runs.
    system 'mv', fn, 'fail'
    log << "\n"
  end
  log
end
# Entry point: ruby pachong.rb FIRST_ID COUNT
# Processes the ids FIRST_ID, FIRST_ID + 1, ... (COUNT ids in total), skipping
# those listed in READY, then files the fetched images and appends the
# collected log to pachong.txt.
first_id = ARGV[0].to_i
count = ARGV[1].to_i
log = +''
(first_id...(first_id + count)).each do |id|
  next if READY.include?(id)

  # to_s guards the append: a nil result from download would otherwise raise
  # TypeError and abort the whole chunk.
  log << download(id).to_s
end

# Move the fetched covers without going through the shell; skip the call when
# there is nothing to move so mv does not complain about a literal "*.jpg".
covers = Dir.glob('*.jpg')
system 'mv', *covers, 'download' unless covers.empty?

File.open('pachong.txt', 'a') { |f| f << log }
@jayleicn
Copy link
Author

jayleicn commented Jan 3, 2018

@gxm11
Copy link

gxm11 commented Jan 5, 2018

line 33 should be return ''

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment