sfan5/parse.js

## parse.js
// Go to https://twitter.com/favorites and scroll all the way down until no more fav'd @catgirls_bot tweets are loaded.
// One way to scroll automatically:
//   window.temp=function(){window.scrollTo(0,document.body.scrollHeight);setTimeout(window.temp,750)};window.temp()
//   window.temp=function(){}; // run this to stop scrolling
// Collect every favourited @catgirls_bot tweet currently loaded in the page
// and show the result as JSON in a new window.
// NOTE(review): the positional children[...] lookups presumably mirror the
// markup of Twitter's favorites page when this was written — confirm before use.
var _q = document.getElementById('stream-items-id');
var _i;
var _o = [];
for(_i = 0; _i < _q.children.length; _i++) {
	var _e = _q.children[_i];
	// Strict comparison: getAttribute() yields a string or null, no coercion wanted.
	if(_e.children[0].getAttribute('data-screen-name') === 'catgirls_bot') {
		var _a = _e.children[0].getAttribute('data-tweet-id');
		_e = _e.children[0].children[1];
		// link is null when the element carries no data-expanded-url attribute
		var _l = _e.children[1].children[0].children[0].getAttribute('data-expanded-url');
		var _p = _e.children[2].children[0].children[0].children[0].getAttribute('data-image-url') + ":large";
		_o.push({id: _a, link: _l, picture: _p});
	}
}
// document.write() feeds its argument through the HTML parser, so the JSON
// must be escaped first; otherwise "&name" or "<" sequences inside a URL are
// interpreted as markup and the text copied from the window no longer matches
// the collected data.
var _j = JSON.stringify(_o, null, '  ')
	.replace(/&/g, '&amp;')
	.replace(/</g, '&lt;')
	.replace(/>/g, '&gt;');
var _w = window.open();
// Nothing is shown when no window could be opened.
if(_w) {
	_w.document.open();
	_w.document.write('<h1>Copy this and save it into <i>favs.json</i></h1>');
	_w.document.write('<pre>');
	_w.document.write(_j);
	_w.document.write('</pre>');
	_w.document.close();
}

## pixivdl.py
#!/usr/bin/env python3
import sys
import urllib.request
import re
import getopt

##################
# Value of the PHPSESSID cookie that urlopen() sends with every request.
# NOTE(review): presumably taken from a logged-in pixiv browser session — confirm.
PHPSESSID = "your_phpsessid_here"
##################

def gopt(opts, n):
	"""Look up the value of option ``n`` in ``opts``.

	``opts`` is an iterable of (name, value) pairs. If the option occurs
	more than once the value of the last occurrence is returned; if it does
	not occur at all the result is None.
	"""
	values = [pair[1] for pair in opts if pair[0] == n]
	return values[-1] if values else None

# Matches the <img ... class="original-image"> tag; group 1 is its data-src URL.
r_image = re.compile(r'<img alt="[^"]+" width="[0-9]+" height="[0-9]+" data-src="([a-z0-9:/.\-_]+)" class="original-image">')
# Matches a link to the mode=manga viewer (with the "&" HTML-escaped as "&amp;"); used only as a yes/no test.
r_manga = re.compile(r'<a href="member_illust\.php\?mode=manga&amp;illust_id=[0-9]+"')
# Matches an <img src="..."> tag; group 1 is the src URL.
r_mangaimage = re.compile(r'<img src="([a-z0-9:/.\-_]+)"')

# Parse the command line: "-o FILENAME" and "-q" as short options,
# "--no-manga" as a long option; everything else ends up in args.
try:
	opts, args = getopt.getopt(sys.argv[1:], "o:q", ['no-manga'])
except getopt.GetoptError as e:
	print(str(e))
	# sys.exit() instead of exit(): the latter is a helper injected by the
	# site module and is not guaranteed to exist in every interpreter setup.
	sys.exit(1)

# At least the pixiv ID is required.
if len(args) < 1:
	print("Usage: %s [-q] [--no-manga] [-o FILENAME] <pixiv ID> [2nd argument]" % sys.argv[0])
	print("The 2nd argument is usually the page number.")
	sys.exit(1)

def urlopen(url, headers):
	"""Open ``url`` with a browser User-Agent and the session cookie attached.

	``headers`` is a dict of extra request headers; it is applied after the
	defaults, so its entries win on conflict. Returns the response object
	produced by urllib.request.urlopen().
	"""
	merged = {}
	merged["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0"
	merged["Cookie"] = "PHPSESSID=%s; p_ab_id=4; a_type=0" % PHPSESSID
	merged.update(headers)
	request = urllib.request.Request(url, headers=merged)
	return urllib.request.urlopen(request)

# The first positional argument is the numeric pixiv work ID.
try:
	pixiv_id = int(args[0])
except ValueError:
	# Report a bad ID the same way as the other usage errors instead of a traceback.
	print("Invalid pixiv ID: %s" % args[0])
	sys.exit(1)
# Optional second positional argument, kept as a string ("" when absent).
arg2 = "" if len(args) < 2 else args[1].strip()
page = "http://www.pixiv.net/member_illust.php?mode=medium&illust_id=%d" % pixiv_id

# Download the HTML of the work's page; the with-block closes the response
# even when read() raises.
with urlopen(page, {}) as r:
	data = r.read().decode("utf-8", "ignore")

# Work out the URL of the image to download (imgurl). For a manga work the
# requested page is fetched through the manga_big viewer; "page" is updated
# to that viewer URL.
if r_manga.search(data): # Manga
	if ('--no-manga', '') in opts:
		if ('-q', '') not in opts:
			print("Illustration is a manga and --no-manga given, exiting.")
		sys.exit(1)
	if arg2 == "":
		if ('-q', '') not in opts:
			print("Illustration is a manga but no page number given, downloading first page by default")
			print("Set the page number by giving the script a second argument")
		page_no = 0
	else:
		# The user counts pages from 1, the URL parameter from 0.
		page_no = int(arg2) - 1
	page = "http://www.pixiv.net/member_illust.php?mode=manga_big&illust_id=%d&page=%d" % (pixiv_id, page_no)
	# The with-block closes the response even when read() raises.
	with urlopen(page, {}) as r:
		data = r.read().decode("utf-8", "ignore")
	m = r_mangaimage.search(data)
else: # Single image
	m = r_image.search(data)

# Both branches fail the same way when the page does not contain the image tag.
if not m:
	print("Couldn't find URL")
	sys.exit(1)
imgurl = m.group(1)

# Download imgurl into the current directory.
if ('-q', '') not in opts:
	print("Downloading %s..." % imgurl)
# Default file name: last path component of the image URL.
filename = imgurl.split("/")[-1]
out_base = gopt(opts, '-o')
if out_base:
	# -o gives the name without extension; keep the extension of the remote file.
	filename = out_base + "." + filename.split(".")[-1]

# The request is opened before the output file so that a failed request does
# not leave an empty file behind; the with-block closes both handles on error.
# The page the image was found on is sent as Referer.
with urlopen(imgurl, {"Referer": page}) as r, open(filename, "wb") as f:
	while True:
		data = r.read(256 * 1024) # 256 KiB
		if not data:
			break
		f.write(data)

if ('-q', '') not in opts:
	print("Saved work %d as %s" % (pixiv_id, filename))

## process.py
#!/usr/bin/env python3
import json

import urllib
import urllib.request
import re

import subprocess

import multiprocessing
import time

# JSON file with the list of tweets to process (loaded with json.load below).
infile = "favs.json"
# Prefix for every output file; paths are built by plain string concatenation,
# so the trailing slash is required.
outdir = "/tmp/twpics/"
# Number of worker processes started for worker_twit.
nworkers_twit = 4
# Number of worker processes started for worker_orig.
nworkers_orig = 5

# File extension of a picture URL that ends in ":large" (group 1).
r_ext = re.compile(r'\.([a-z]+):large$')
# pixiv "medium" work page (http only); group 1 is the illust_id.
r_pixiv = re.compile(r'^http:\/\/www\.pixiv\.net\/member_illust\.php\?mode=medium&illust_id=([0-9]+)$')
# danbooru post page.
r_danbooru = re.compile(r'^https?:\/\/danbooru\.donmai\.us\/posts\/[0-9]+$')
# yande.re post page.
r_yandere = re.compile(r'^https?:\/\/yande\.re\/post\/show\/[0-9]+$')
# "highres" link inside a yande.re post page; group 1 is the 32-character
# image id, group 2 the file extension (at most 3 letters).
r_yandere_orig = re.compile(r'<a class="original-file-changed" id="highres" href="https:\/\/files\.yande\.re\/image\/([0-9a-z]{32})\/[^"]+\.([a-z]{,3})">')

def urlopen(url, headers=None):
	"""Open ``url`` with a browser User-Agent header.

	``headers`` is an optional dict of extra request headers; its entries
	override the default User-Agent on conflict. Returns the response object
	produced by urllib.request.urlopen().
	"""
	# None instead of a shared mutable {} as the default value.
	h = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0"}
	if headers:
		h.update(headers)
	r = urllib.request.Request(url, headers=h)
	return urllib.request.urlopen(r)

def fdcopy(fp1, fp2):
	"""Copy everything that can be read from ``fp1`` into ``fp2``.

	Data is moved in pieces of at most 1 MiB until a read comes back empty.
	Neither object is closed. Returns None.
	"""
	chunk_size = 1024 * 1024
	chunk = fp1.read(chunk_size)
	while chunk:
		fp2.write(chunk)
		chunk = fp1.read(chunk_size)

def download_original(link, filebase):
	"""Try to download the original image that ``link`` points to.

	``link`` is compared against the pixiv, danbooru and yande.re URL
	patterns. ``filebase`` is the output path without extension; the
	extension of the remote file is appended to it.

	Returns True when an image was saved, False when the link matches no
	supported site or the site does not expose an image URL. Network and
	file errors are not caught here.
	"""
	pixiv_match = r_pixiv.search(link)
	if pixiv_match:
		# pixiv downloads are delegated to the external pixivdl.py script;
		# its exit status tells whether the download worked.
		status = subprocess.call(["/path/to/pixivdl.py", "-q", "--no-manga", "-o", filebase, pixiv_match.group(1)])
		return status == 0

	if r_danbooru.search(link):
		# Appending ".json" to the post URL yields the post's metadata.
		with urlopen(link + ".json") as s:
			data = json.loads(s.read().decode("utf-8", "ignore"))
		if "file_url" not in data: # deleted image
			return False
		file_url = data["file_url"]
		# file_url is appended to the site root, i.e. it is treated as a site-relative path.
		with urlopen("http://danbooru.donmai.us" + file_url) as s, open(filebase + "." + file_url.split(".")[-1], "wb") as f:
			fdcopy(s, f)
		return True

	if r_yandere.search(link):
		with urlopen(link) as s:
			data = s.read().decode("utf-8", "ignore")
		m = r_yandere_orig.search(data)
		if not m:
			return False
		# Rebuild the file URL from the image id and extension found in the page.
		with urlopen("https://files.yande.re/image/%s/a.%s" % m.groups()) as s, open(filebase + "." + m.group(2), "wb") as f:
			fdcopy(s, f)
		return True

	return False

def worker_twit(queue):
	"""Worker loop: download the Twitter-hosted picture of each queued entry.

	Entries are dicts with at least the keys "id" and "picture". The loop
	ends when the string "STOP" is taken from ``queue``. Each picture is
	saved as <outdir><id>.<extension>.
	"""
	for e in iter(queue.get, "STOP"):
		print("worker_twit: " + e["id"])
		ext_match = r_ext.search(e["picture"])
		if not ext_match:
			# Skip the entry instead of letting an AttributeError end this worker.
			print("worker_twit: unrecognised picture URL for " + e["id"] + ", skipping")
			continue
		picext = ext_match.group(1)
		# The ":large" suffix is swapped for ":orig" in the request URL.
		# The with-block closes both handles even when the copy fails.
		with urlopen(e["picture"].replace(":large", ":orig")) as s, open(outdir + e["id"] + "." + picext, "wb") as f:
			fdcopy(s, f)

def worker_orig(queue):
	"""Worker loop: fetch the original image behind each queued entry's link.

	Entries are dicts with at least the keys "id" and "link". The loop ends
	when the string "STOP" is taken from ``queue``. When download_original()
	reports failure, the link is written to <outdir><id>_o.txt instead.
	"""
	for e in iter(queue.get, "STOP"):
		print("worker_orig: " + e["id"])
		if not download_original(e["link"], outdir + e["id"] + "_o"):
			# The with-block closes the note file even when a write fails.
			with open(outdir + e["id"] + "_o.txt", "w") as f:
				f.write(e["link"])
				f.write("\n")

# Entry point. The guard keeps child processes from re-running this section
# when multiprocessing imports the main module (spawn start method).
if __name__ == "__main__":
	with open(infile, "r") as f:
		data = json.load(f)

	# One queue per kind of job.
	queue_twit = multiprocessing.SimpleQueue()
	queue_orig = multiprocessing.SimpleQueue()

	for i in range(nworkers_twit):
		multiprocessing.Process(target=worker_twit, args=(queue_twit, )).start()
	for i in range(nworkers_orig):
		multiprocessing.Process(target=worker_orig, args=(queue_orig, )).start()

	# Every entry gets its Twitter picture downloaded; only entries that
	# carry a link are also queued for the original-image download.
	for e in data:
		queue_twit.put(e)
		if e['link'] is not None:
			queue_orig.put(e)

	# One "STOP" sentinel per worker ends each worker loop.
	for i in range(nworkers_twit):
		queue_twit.put("STOP")
	for i in range(nworkers_orig):
		queue_orig.put("STOP")
	// Go to https://twitter.com/favorites and scroll all the way down until no more fav'd @catgirls_bot tweets are loaded.
	// One way to scroll automatically:
	// window.temp=function(){window.scrollTo(0,document.body.scrollHeight);setTimeout(window.temp,750)};window.temp()
	// window.temp=function(){}; // run this to stop scrolling
	// Collect every favourited @catgirls_bot tweet currently loaded in the page
	// and show the result as JSON in a new window.
	// NOTE(review): the positional children[...] lookups presumably mirror the
	// markup of Twitter's favorites page when this was written — confirm before use.
	var _q = document.getElementById('stream-items-id');
	var _i;
	var _o = [];
	for(_i = 0; _i < _q.children.length; _i++) {
		var _e = _q.children[_i];
		// Strict comparison: getAttribute() yields a string or null, no coercion wanted.
		if(_e.children[0].getAttribute('data-screen-name') === 'catgirls_bot') {
			var _a = _e.children[0].getAttribute('data-tweet-id');
			_e = _e.children[0].children[1];
			// link is null when the element carries no data-expanded-url attribute
			var _l = _e.children[1].children[0].children[0].getAttribute('data-expanded-url');
			var _p = _e.children[2].children[0].children[0].children[0].getAttribute('data-image-url') + ":large";
			_o.push({id: _a, link: _l, picture: _p});
		}
	}
	// document.write() feeds its argument through the HTML parser, so the JSON
	// must be escaped first; otherwise "&name" or "<" sequences inside a URL are
	// interpreted as markup and the text copied from the window no longer matches
	// the collected data.
	var _j = JSON.stringify(_o, null, ' ')
		.replace(/&/g, '&amp;')
		.replace(/</g, '&lt;')
		.replace(/>/g, '&gt;');
	var _w = window.open();
	// Nothing is shown when no window could be opened.
	if(_w) {
		_w.document.open();
		_w.document.write('<h1>Copy this and save it into <i>favs.json</i></h1>');
		_w.document.write('<pre>');
		_w.document.write(_j);
		_w.document.write('</pre>');
		_w.document.close();
	}
	#!/usr/bin/env python3
	import sys
	import urllib.request
	import re
	import getopt

	##################
	# Value of the PHPSESSID cookie that urlopen() sends with every request.
	# NOTE(review): presumably taken from a logged-in pixiv browser session — confirm.
	PHPSESSID = "your_phpsessid_here"
	##################

	def gopt(opts, n):
		"""Look up the value of option ``n`` in ``opts``.

		``opts`` is an iterable of (name, value) pairs. If the option occurs
		more than once the value of the last occurrence is returned; if it does
		not occur at all the result is None.
		"""
		values = [pair[1] for pair in opts if pair[0] == n]
		return values[-1] if values else None

	# Matches the <img ... class="original-image"> tag; group 1 is its data-src URL.
	r_image = re.compile(r'<img alt="[^"]+" width="[0-9]+" height="[0-9]+" data-src="([a-z0-9:/.\-_]+)" class="original-image">')
	# Matches a link to the mode=manga viewer; used only as a yes/no test.
	# NOTE(review): this expects a raw "&" in the href; if the page HTML-escapes
	# it as "&amp;" the pattern will not match — confirm against the real markup.
	r_manga = re.compile(r'<a href="member_illust\.php\?mode=manga&illust_id=[0-9]+"')
	# Matches an <img src="..."> tag; group 1 is the src URL.
	r_mangaimage = re.compile(r'<img src="([a-z0-9:/.\-_]+)"')

	# Parse the command line: "-o FILENAME" and "-q" as short options,
	# "--no-manga" as a long option; everything else ends up in args.
	try:
		opts, args = getopt.getopt(sys.argv[1:], "o:q", ['no-manga'])
	except getopt.GetoptError as e:
		print(str(e))
		# sys.exit() instead of exit(): the latter is a helper injected by the
		# site module and is not guaranteed to exist in every interpreter setup.
		sys.exit(1)

	# At least the pixiv ID is required.
	if len(args) < 1:
		print("Usage: %s [-q] [--no-manga] [-o FILENAME] <pixiv ID> [2nd argument]" % sys.argv[0])
		print("The 2nd argument is usually the page number.")
		sys.exit(1)

	def urlopen(url, headers):
		"""Open ``url`` with a browser User-Agent and the session cookie attached.

		``headers`` is a dict of extra request headers; it is applied after the
		defaults, so its entries win on conflict. Returns the response object
		produced by urllib.request.urlopen().
		"""
		h = {
			"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
			"Cookie": "PHPSESSID=%s; p_ab_id=4; a_type=0" % PHPSESSID
		}
		h.update(headers)
		r = urllib.request.Request(url, headers=h)
		return urllib.request.urlopen(r)

	# The first positional argument is the numeric pixiv work ID.
	try:
		pixiv_id = int(args[0])
	except ValueError:
		# Report a bad ID the same way as the other usage errors instead of a traceback.
		print("Invalid pixiv ID: %s" % args[0])
		sys.exit(1)
	# Optional second positional argument, kept as a string ("" when absent).
	arg2 = "" if len(args) < 2 else args[1].strip()
	page = "http://www.pixiv.net/member_illust.php?mode=medium&illust_id=%d" % pixiv_id

	# Download the HTML of the work's page; the with-block closes the response
	# even when read() raises.
	with urlopen(page, {}) as r:
		data = r.read().decode("utf-8", "ignore")

	# Work out the URL of the image to download (imgurl). For a manga work the
	# requested page is fetched through the manga_big viewer; "page" is updated
	# to that viewer URL.
	if r_manga.search(data): # Manga
		if ('--no-manga', '') in opts:
			if ('-q', '') not in opts:
				print("Illustration is a manga and --no-manga given, exiting.")
			sys.exit(1)
		if arg2 == "":
			if ('-q', '') not in opts:
				print("Illustration is a manga but no page number given, downloading first page by default")
				print("Set the page number by giving the script a second argument")
			page_no = 0
		else:
			# The user counts pages from 1, the URL parameter from 0.
			page_no = int(arg2) - 1
		page = "http://www.pixiv.net/member_illust.php?mode=manga_big&illust_id=%d&page=%d" % (pixiv_id, page_no)
		# The with-block closes the response even when read() raises.
		with urlopen(page, {}) as r:
			data = r.read().decode("utf-8", "ignore")
		m = r_mangaimage.search(data)
	else: # Single image
		m = r_image.search(data)

	# Both branches fail the same way when the page does not contain the image tag.
	if not m:
		print("Couldn't find URL")
		sys.exit(1)
	imgurl = m.group(1)

	# Download imgurl into the current directory.
	if ('-q', '') not in opts:
		print("Downloading %s..." % imgurl)
	# Default file name: last path component of the image URL.
	filename = imgurl.split("/")[-1]
	out_base = gopt(opts, '-o')
	if out_base:
		# -o gives the name without extension; keep the extension of the remote file.
		filename = out_base + "." + filename.split(".")[-1]

	# The request is opened before the output file so that a failed request does
	# not leave an empty file behind; the with-block closes both handles on error.
	# The page the image was found on is sent as Referer.
	with urlopen(imgurl, {"Referer": page}) as r, open(filename, "wb") as f:
		while True:
			data = r.read(256 * 1024) # 256 KiB
			if not data:
				break
			f.write(data)

	if ('-q', '') not in opts:
		print("Saved work %d as %s" % (pixiv_id, filename))
	#!/usr/bin/env python3
	import json

	import urllib
	import urllib.request
	import re

	import subprocess

	import multiprocessing
	import time

	# JSON file with the list of tweets to process (loaded with json.load below).
	infile = "favs.json"
	# Prefix for every output file; paths are built by plain string concatenation,
	# so the trailing slash is required.
	outdir = "/tmp/twpics/"
	# Number of worker processes started for worker_twit.
	nworkers_twit = 4
	# Number of worker processes started for worker_orig.
	nworkers_orig = 5

	# File extension of a picture URL that ends in ":large" (group 1).
	r_ext = re.compile(r'\.([a-z]+):large$')
	# pixiv "medium" work page (http only); group 1 is the illust_id.
	r_pixiv = re.compile(r'^http:\/\/www\.pixiv\.net\/member_illust\.php\?mode=medium&illust_id=([0-9]+)$')
	# danbooru post page.
	r_danbooru = re.compile(r'^https?:\/\/danbooru\.donmai\.us\/posts\/[0-9]+$')
	# yande.re post page.
	r_yandere = re.compile(r'^https?:\/\/yande\.re\/post\/show\/[0-9]+$')
	# "highres" link inside a yande.re post page; group 1 is the 32-character
	# image id, group 2 the file extension (at most 3 letters).
	r_yandere_orig = re.compile(r'<a class="original-file-changed" id="highres" href="https:\/\/files\.yande\.re\/image\/([0-9a-z]{32})\/[^"]+\.([a-z]{,3})">')

	def urlopen(url, headers=None):
		"""Open ``url`` with a browser User-Agent header.

		``headers`` is an optional dict of extra request headers; its entries
		override the default User-Agent on conflict. Returns the response object
		produced by urllib.request.urlopen().
		"""
		# None instead of a shared mutable {} as the default value.
		h = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0"}
		if headers:
			h.update(headers)
		r = urllib.request.Request(url, headers=h)
		return urllib.request.urlopen(r)

	def fdcopy(fp1, fp2):
		"""Copy everything that can be read from ``fp1`` into ``fp2``.

		Data is moved in pieces of at most 1 MiB until a read comes back empty.
		Neither object is closed. Returns None.
		"""
		chunk_size = 1024 * 1024
		chunk = fp1.read(chunk_size)
		while chunk:
			fp2.write(chunk)
			chunk = fp1.read(chunk_size)

	def download_original(link, filebase):
		"""Try to download the original image that ``link`` points to.

		``link`` is compared against the pixiv, danbooru and yande.re URL
		patterns. ``filebase`` is the output path without extension; the
		extension of the remote file is appended to it.

		Returns True when an image was saved, False when the link matches no
		supported site or the site does not expose an image URL. Network and
		file errors are not caught here.
		"""
		pixiv_match = r_pixiv.search(link)
		if pixiv_match:
			# pixiv downloads are delegated to the external pixivdl.py script;
			# its exit status tells whether the download worked.
			status = subprocess.call(["/path/to/pixivdl.py", "-q", "--no-manga", "-o", filebase, pixiv_match.group(1)])
			return status == 0

		if r_danbooru.search(link):
			# Appending ".json" to the post URL yields the post's metadata.
			with urlopen(link + ".json") as s:
				data = json.loads(s.read().decode("utf-8", "ignore"))
			if "file_url" not in data: # deleted image
				return False
			file_url = data["file_url"]
			# file_url is appended to the site root, i.e. it is treated as a site-relative path.
			with urlopen("http://danbooru.donmai.us" + file_url) as s, open(filebase + "." + file_url.split(".")[-1], "wb") as f:
				fdcopy(s, f)
			return True

		if r_yandere.search(link):
			with urlopen(link) as s:
				data = s.read().decode("utf-8", "ignore")
			m = r_yandere_orig.search(data)
			if not m:
				return False
			# Rebuild the file URL from the image id and extension found in the page.
			with urlopen("https://files.yande.re/image/%s/a.%s" % m.groups()) as s, open(filebase + "." + m.group(2), "wb") as f:
				fdcopy(s, f)
			return True

		return False

	def worker_twit(queue):
		"""Worker loop: download the Twitter-hosted picture of each queued entry.

		Entries are dicts with at least the keys "id" and "picture". The loop
		ends when the string "STOP" is taken from ``queue``. Each picture is
		saved as <outdir><id>.<extension>.
		"""
		for e in iter(queue.get, "STOP"):
			print("worker_twit: " + e["id"])
			ext_match = r_ext.search(e["picture"])
			if not ext_match:
				# Skip the entry instead of letting an AttributeError end this worker.
				print("worker_twit: unrecognised picture URL for " + e["id"] + ", skipping")
				continue
			picext = ext_match.group(1)
			# The ":large" suffix is swapped for ":orig" in the request URL.
			# The with-block closes both handles even when the copy fails.
			with urlopen(e["picture"].replace(":large", ":orig")) as s, open(outdir + e["id"] + "." + picext, "wb") as f:
				fdcopy(s, f)

	def worker_orig(queue):
		"""Worker loop: fetch the original image behind each queued entry's link.

		Entries are dicts with at least the keys "id" and "link". The loop ends
		when the string "STOP" is taken from ``queue``. When download_original()
		reports failure, the link is written to <outdir><id>_o.txt instead.
		"""
		for e in iter(queue.get, "STOP"):
			print("worker_orig: " + e["id"])
			if not download_original(e["link"], outdir + e["id"] + "_o"):
				# The with-block closes the note file even when a write fails.
				with open(outdir + e["id"] + "_o.txt", "w") as f:
					f.write(e["link"])
					f.write("\n")

	# Entry point. The guard keeps child processes from re-running this section
	# when multiprocessing imports the main module (spawn start method).
	if __name__ == "__main__":
		with open(infile, "r") as f:
			data = json.load(f)

		# One queue per kind of job.
		queue_twit = multiprocessing.SimpleQueue()
		queue_orig = multiprocessing.SimpleQueue()

		for i in range(nworkers_twit):
			multiprocessing.Process(target=worker_twit, args=(queue_twit, )).start()
		for i in range(nworkers_orig):
			multiprocessing.Process(target=worker_orig, args=(queue_orig, )).start()

		# Every entry gets its Twitter picture downloaded; only entries that
		# carry a link are also queued for the original-image download.
		for e in data:
			queue_twit.put(e)
			if e['link'] is not None:
				queue_orig.put(e)

		# One "STOP" sentinel per worker ends each worker loop.
		for i in range(nworkers_twit):
			queue_twit.put("STOP")
		for i in range(nworkers_orig):
			queue_orig.put("STOP")