Skip to content

Instantly share code, notes, and snippets.

@jinyu121
Created March 8, 2018 08:28
Show Gist options
  • Save jinyu121/c4f00d1f398021042013870e0e25d284 to your computer and use it in GitHub Desktop.
Download and cut comic face
# -*- coding: utf-8 -*-
import time
import redis
import os
import cv2
from threading import Thread
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
def make_key(url, tiny=False):
    """Derive a storage key from the final path component of *url*.

    With ``tiny=True`` the file extension is stripped as well, yielding a
    bare stem suitable for deduplication.
    """
    name = os.path.basename(url)
    if tiny:
        name, _ = os.path.splitext(name)
    return name
class Detector:
    """Crop anime faces out of an image file using an OpenCV LBP cascade.

    Each detected face is resized to 96x96 and written to ``basedir`` as
    ``<stem>-<index>.jpg``.
    """

    def __init__(self, basedir, cascade_file="lbpcascade_animeface.xml"):
        """
        :param basedir: directory that receives the cropped face images
            (created if missing).
        :param cascade_file: path to the LBP cascade XML model.
        :raises FileNotFoundError: if ``cascade_file`` does not exist.
        """
        # An explicit raise instead of `assert`: asserts are stripped
        # under `python -O`, silently disabling this validation.
        if not os.path.isfile(cascade_file):
            raise FileNotFoundError("{} not found".format(cascade_file))
        self.cascade_file = cascade_file
        self.basedir = basedir
        self.detector = cv2.CascadeClassifier(cascade_file)
        os.makedirs(self.basedir, exist_ok=True)

    def __call__(self, filename, *args, **kwargs):
        """Detect faces in ``filename`` and save each crop under basedir."""
        image = cv2.imread(filename)
        if image is None:
            # cv2.imread returns None for missing/corrupt images instead of
            # raising; bail out rather than crash inside cvtColor.
            return
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Histogram equalization improves detection on low-contrast art.
        gray = cv2.equalizeHist(gray)
        faces = self.detector.detectMultiScale(
            gray, scaleFactor=1.1, minNeighbors=5, minSize=(48, 48))
        for i, (x, y, w, h) in enumerate(faces):
            face = image[y: y + h, x:x + w, :]
            face = cv2.resize(face, (96, 96))
            save_filename = '{}-{}.jpg'.format(
                os.path.basename(filename).split('.')[0], i)
            cv2.imwrite(os.path.join(self.basedir, save_filename), face)
class RedisDatabase:
    """Redis-backed URL work queue.

    Pending items live in a Redis list; every item ever consumed is
    remembered in a companion set so it is never re-enqueued.  A small
    ``<key>_<name>`` string namespace stores bookkeeping values such as
    the current crawl page.
    """

    def __init__(self, name, namespace='queue', **redis_kwargs):
        """The default connection parameters are: host='localhost', port=6379, db=0"""
        self._db = redis.StrictRedis(**redis_kwargs)
        self._key = '{}:{}'.format(namespace, name)
        self._key_used = '{}:{}__used'.format(namespace, name)

    def __len__(self):
        # Count of items still waiting in the queue list.
        return self._db.llen(self._key)

    def __iter__(self):
        return self

    def __next__(self):
        # Blocking pop; record the item in the "used" set so duplicates
        # pushed later are filtered out by push().
        _, raw = self._db.blpop(self._key)
        item = raw.decode()
        self._db.sadd(self._key_used, item)
        return item

    def __setitem__(self, key, value):
        self._db.set("{}_{}".format(self._key, key), value)

    def __getitem__(self, key):
        return self._db.get("{}_{}".format(self._key, key))

    def is_empty(self):
        return len(self) == 0

    def push(self, item):
        # Enqueue only URLs whose deduplication key was never consumed.
        if not self._db.sismember(self._key_used, make_key(item, tiny=True)):
            self._db.rpush(self._key, item)

    def pop(self):
        return next(self)
class ImageFinder(Thread):
    """Producer thread: crawls konachan listing pages and pushes
    preview-image URLs onto the shared queue whenever it runs low."""

    def __init__(self, database, page_start=1, sleep=1):
        """
        :param database: RedisDatabase-like queue shared with the savers.
        :param page_start: first listing page when no progress is stored.
        :param sleep: seconds to idle between queue-level checks.
        """
        super(ImageFinder, self).__init__()
        self.base_url = "http://konachan.net/post?page={}&tags="
        self._db = database
        self._sleep = sleep
        # Resume from the page number persisted in the database, if any.
        if self._db['page'] is None:
            self._db['page'] = page_start

    def _get_page_num(self):
        return int(self._db['page'])

    def _inc_page_num(self):
        self._db['page'] = self._get_page_num() + 1

    def _parse_page(self):
        """Fetch the current listing page and enqueue every preview URL."""
        try:
            current = self._get_page_num()
            tqdm.write("Parse page {}".format(current))
            html = requests.get(self.base_url.format(current)).text
            soup = BeautifulSoup(html, 'html.parser')
            for img in soup.find_all('img', class_="preview"):
                image_url = 'http:' + img['src']
                self._db.push(image_url)
                tqdm.write("Add URL: {}".format(image_url))
            # Only advance once the whole page was processed; on error the
            # same page is retried on the next pass.
            self._inc_page_num()
        except Exception as e:
            print(e)

    def run(self):
        while True:
            # Keep roughly 100 URLs buffered ahead of the downloaders.
            if len(self._db) < 100:
                self._parse_page()
            time.sleep(self._sleep)
class DataSaver(Thread):
    """Consumer thread: pops image URLs from the queue, downloads each one
    into ``base_dir`` and runs the face detector on the saved file."""

    def __init__(self, database, base_dir, detector, sleep=1):
        """
        :param database: RedisDatabase-like iterable of URLs (blocking).
        :param base_dir: directory for the raw downloaded images.
        :param detector: callable invoked with the saved file path.
        :param sleep: seconds to pause between downloads.
        """
        super(DataSaver, self).__init__()
        self.base_dir = base_dir
        self._db = database
        self._sleep = sleep
        self.detector = detector
        if not os.path.exists(base_dir):
            os.makedirs(base_dir, mode=0o755, exist_ok=True)

    def _download_file(self, url, filename):
        """Stream ``url`` into ``filename`` and run face detection on it.

        Propagates HTTP/IO errors to the caller, which removes the
        partial file.
        """
        if os.path.exists(filename):
            tqdm.write('File {} exists, skip'.format(filename))
            return
        tqdm.write('Downloading {}'.format(url))
        # With stream=True the connection stays open until the body is
        # fully consumed or the response is closed; the context manager
        # guarantees the close even when raise_for_status() or a write
        # fails (the original leaked the connection on those paths).
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
        self.detector(filename)  # Detect face and save

    def _delete_file(self, filename):
        if os.path.exists(filename):
            os.remove(filename)

    def run(self):
        # Iterating the database blocks until an item is available.
        for url in self._db:
            filename = os.path.join(self.base_dir, make_key(url))
            try:
                self._download_file(url, filename)
            except Exception as e:
                # Best effort: log, drop the partial file, keep running.
                print(e)
                self._delete_file(filename)
            time.sleep(self._sleep)
def main(args):
    """Wire up the pipeline: one finder thread feeding the queue and
    ``num_walkers`` downloader threads draining it, each with its own
    face detector."""
    queue = RedisDatabase(args['redis_pre'])

    finder = ImageFinder(queue, sleep=args['finder_sleep'])
    finder.start()

    for _ in range(args['num_walkers']):
        saver = DataSaver(
            queue,
            args['save_dir'],
            Detector(args["cut_dir"]),
            sleep=args['walker_sleep'],
        )
        saver.start()
if __name__ == '__main__':
    # Runtime configuration: redis key prefix, output directories,
    # worker count and per-thread polling intervals.
    config = {
        "redis_pre": 'icon_queue',
        "save_dir": "download",
        "cut_dir": "data/face",
        "num_walkers": 5,
        "finder_sleep": 1.,
        "walker_sleep": 0.1,
    }
    main(config)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment