Skip to content

Instantly share code, notes, and snippets.

@ChronoMonochrome
Last active October 20, 2016 17:56
Show Gist options
  • Save ChronoMonochrome/96517538b83ec1872b140c1717ac37aa to your computer and use it in GitHub Desktop.
Visual novels database parser
*.c
*.pyc
*.html
*.o
*.so
build
#!/usr/bin/env python
"""Build script: compiles vnparser.pyx into the `vnparser` extension."""
try:
    # Prefer setuptools when available...
    from setuptools import setup
    from setuptools import Extension
except ImportError:
    # ...and fall back to plain distutils otherwise.
    from distutils.core import setup
    from distutils.extension import Extension
from Cython.Distutils import build_ext
#import numpy as np

setup(
    name= 'Visual database parser class',
    # Route the build_ext command through Cython so .pyx sources compile.
    cmdclass = {'build_ext': build_ext},
    include_dirs = ['/usr/local/lib/python2.7/dist-packages/'],
    ext_modules = [Extension("vnparser", ["vnparser.pyx"])])
from vnparser import *

# Location of the saved HTML pages and parser tuning knobs.
PATH = "../html/v"
VN_MAX_ID = 40
BUFFER_SIZE = 10

# Args: (path, vn_max_id, buffer_size, threads, mysql_live_execution, output)
v = VNParser(PATH, VN_MAX_ID, BUFFER_SIZE, 4, False, PATH)
v.genDB()
#v.genDB_singlethreaded()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import collections, sys, os
import MySQLdb
from bs4 import BeautifulSoup
from pprint import pprint
from multiprocessing import Pool, Lock, cpu_count
from copy_reg import pickle
from types import MethodType
# Shared lock installed into each worker process by _MUTEX_init (the
# Pool initializer); serialises appends to the generated .sql files.
_MUTEX = None
# One-shot guard so the copy_reg pickle hook below is registered only once.
_PICKLE_METHOD_REGISTRED = False
def _pickle_method(method):
    """Reduce a bound method to (reconstructor, state) for pickling.

    multiprocessing must pickle bound methods to dispatch them to
    workers (Python 2 cannot do this natively), so we ship the method
    name together with its instance and class.
    """
    return _unpickle_method, (method.im_func.__name__,
                              method.im_self,
                              method.im_class)
def _unpickle_method(func_name, obj, cls):
for cls in cls.mro():
try:
func = cls.__dict__[func_name]
except KeyError:
pass
else:
break
return func.__get__(obj, cls)
# Register the bound-method pickle hook exactly once per interpreter:
# Pool.map_async pickles self._process_multiple_args for each worker.
if not _PICKLE_METHOD_REGISTRED:
    pickle(MethodType, _pickle_method, _unpickle_method)
    _PICKLE_METHOD_REGISTRED = True
def _MUTEX_init(mutex, out):
global _MUTEX
_MUTEX = mutex
try:
os.remove(out + "/novels.sql")
os.remove(out + "/novels_screenshots.sql")
except:
pass
class VNParser(object):
    """Parse saved VNDB visual-novel pages into SQL dump files.

    Pages are read from <path>/<id>.html.  genDB() splits the id range
    into buffer_size work units, fans them out over a multiprocessing
    Pool, and each worker appends the generated INSERT statements to
    <output>/novels.sql and <output>/novels_screenshots.sql (appends
    are serialised with the module-global _MUTEX).
    """

    def __init__(self, path, vn_max_id, buffer_size, threads_number = None, mysql_live_execution = False, output = None):
        """
        Args:
            path: directory containing the downloaded <id>.html pages.
            vn_max_id: highest visual-novel id to look for.
            buffer_size: pages per work unit handed to a worker.
            threads_number: pool size; defaults to cpu_count().
            mysql_live_execution: reserved flag for a live-MySQL write
                path (currently disabled everywhere in this module).
            output: directory for the generated .sql files; required
                unless mysql_live_execution is set.

        Raises:
            Exception: if neither mysql_live_execution nor output is given.
        """
        if not mysql_live_execution and not output:
            raise Exception("error, either mysql_live_execution or output must be specified")
        self.vn_max_id = vn_max_id
        self.buffer_size = buffer_size
        # Default to one worker per CPU when the caller does not care.
        self.threads_number = threads_number or cpu_count()
        self.mysql_live_execution = mysql_live_execution
        self.path = path
        self.output = output
        # NOTE(review): a commented-out block opening a live MySQL
        # connection (with hard-coded credentials) used to sit here; it
        # was dead code and has been removed.

    def __exit__(self):
        # No resources held.  NOTE(review): without a matching
        # __enter__ this object cannot actually be used in a `with`
        # statement — confirm whether the context protocol is intended.
        pass

    def getDictFromTable(self, table):
        """Convert the <tr> rows of a vndetails table to an OrderedDict.

        Two-cell rows become key -> cell text (or a list of hrefs when
        the value cell contains links); the description row (td.vndesc)
        becomes <h2 text> -> <p text>.
        """
        res = collections.OrderedDict()
        for row in table:
            tds = row.findAll("td")
            if len(tds) == 2:
                key = tds[0].text
                value = tds[1]
                all_hrefs = value.findAll("a")
                # Prefer link targets when the cell is a list of links.
                value = [a.get("href") for a in all_hrefs] if all_hrefs else value.text
            else:
                desc = row.find("td", attrs={'class': 'vndesc'})
                key = desc.find('h2').text
                value = desc.find('p').text
            res[key] = value
        return res

    def _process_multiple_args(self, args):
        # Pool.map adapter: unpacks one (start, end, writeout_args) tuple.
        start, end, writeout_args = args
        self.process(start, end, writeout_args)

    def process(self, start, end, writeout_args = None):
        """Parse pages start..end-1 and append their SQL to the outputs.

        Args:
            start, end: half-open id range to scan.
            writeout_args: [output_dir, unit_id]; when given, collected
                rows are appended to <output_dir>/novels.sql and
                <output_dir>/novels_screenshots.sql under _MUTEX.
        """
        novels = []
        novels_screenshots = []
        for v_id in xrange(start, end):
            try:
                # `with` so the handle is closed even on a read error.
                with open("%s/%d.html" % (self.path, v_id), "rb") as f:
                    html = f.read()
                print("open %d.html" % v_id)
            except (IOError, OSError):
                print("%d.html is not found, skipping..." % v_id)
                continue
            parsed_html = BeautifulSoup(html, "html.parser")
            vndetails = parsed_html.body.find('div', attrs={'class':'vndetails'})
            if not vndetails:
                print("page %d.html doesn't have vndetails div, skipping..." % v_id)
                continue
            try:
                screens = [img.get('src') for img in parsed_html.body.find('div', attrs={'class':'scr'}).findAll("img")]
            except AttributeError:
                print("screens weren't found on page %d.html" % v_id)
                # BUGFIX: this was None, which crashed the unconditional
                # screenshot loop below with a TypeError.
                screens = []
            try:
                img = vndetails.find("p").find("img")
            except AttributeError:
                print("title img was not found on page %d.html" % v_id)
                img = u'null'
            novel = self.getDictFromTable(vndetails.findAll("tr"))
            novel[u'v_id'] = v_id
            novel[u'image'] = img
            novels.append(novel)
            for screen_id, screen in enumerate(screens):
                novel_screenshot = collections.OrderedDict()
                novel_screenshot[u'v_id'] = unicode(v_id)
                novel_screenshot[u'screen_id'] = unicode(screen_id)
                novel_screenshot[u'screenshots'] = screen
                novels_screenshots.append(novel_screenshot)
        # NOTE(review): a commented-out buffered live-MySQL flush used
        # to live inside the loop; it referenced undefined names
        # (BUFFER_SIZE, self.db) and has been removed as dead code.
        if writeout_args:
            filepath, unit_id = writeout_args  # renamed: `id` shadowed the builtin

            def writeout(name, tables, tables_names_conversion):
                # Render the rows and append under the shared lock.
                s = self.mysqlQueries(name, tables, tables_names_conversion)
                with _MUTEX:
                    with open(filepath + "/" + name + ".sql", 'ab') as f:
                        f.write(s)

            novels_names_conversion = collections.OrderedDict()
            novels_names_conversion['v_id'] = u'v_id'
            novels_names_conversion['romaji'] = u'Title'
            novels_names_conversion['japanese'] = u'Original title'
            novels_names_conversion['duration'] = u'Length'
            novels_names_conversion['synopsis_en'] = u'Description'
            novels_names_conversion['posters'] = u'image'
            writeout("novels", novels, novels_names_conversion)

            novels_screenshots_names_conversion = collections.OrderedDict()
            novels_screenshots_names_conversion['v_id'] = u'v_id'
            novels_screenshots_names_conversion['screen_id'] = u'screen_id'
            novels_screenshots_names_conversion['screenshots'] = u'screenshots'
            writeout("novels_screenshots", novels_screenshots, novels_screenshots_names_conversion)

    def mysqlQueries(self, tables_name, tables, table_column_dict):
        """Render one INSERT INTO <tables_name> statement for *tables*.

        Args:
            tables_name: SQL table name.
            tables: list of per-row dicts as built by process().
            table_column_dict: OrderedDict mapping SQL column name ->
                key to look up in each row dict (missing keys -> 'null').

        Returns:
            UTF-8 encoded INSERT statement (bytes).

        NOTE(review): values are embedded by string formatting with only
        single quotes escaped — acceptable for trusted local HTML dumps,
        but this is NOT a general-purpose SQL escaper.
        """
        names, values = table_column_dict.keys(), table_column_dict.values()

        def prepare(table, keys):
            # One "('v1', ..., 'vN')," group for a single row.
            fmt = u"(%s),\n" % ("'%s',\n" * (len(keys) - 1) + "'%s'\n")
            vals = [unicode(table.get(k, u'null')).replace("'", "\\'") for k in keys]
            return fmt % tuple(vals)

        s = ("""INSERT INTO %s (%s) VALUES""" % (tables_name, "%s,\n" * (len(names) - 1) + '%s\n')) % (tuple(names))
        if not self.mysql_live_execution:
            for table in tables:
                s += prepare(table, values)
            s = s[:-2] + u";\n"  # drop the trailing ",\n"
            return s.encode("u8")
        # NOTE(review): the live-execution branch existed only as
        # commented-out cursor.execute code and has been removed; with
        # the flag set this method currently returns None.

    def genDB(self):
        """Generate the SQL dumps using a pool of worker processes."""
        if not self.mysql_live_execution:
            c = 0
            start, end = 0, self.buffer_size
            mutex = Lock()
            pool = Pool(processes = self.threads_number, initializer = _MUTEX_init, initargs = (mutex, self.output))
            while c <= self.vn_max_id:
                args_map = []
                for _ in range(self.threads_number):
                    if end > self.vn_max_id:
                        end = self.vn_max_id
                    args_map.append((start + 1, end + 1, [self.output, c]))
                    start += self.buffer_size
                    end += self.buffer_size
                c = end + 1
                # .get() with a huge timeout keeps the main process
                # interruptible (a bare .get() blocks KeyboardInterrupt).
                pool.map_async(self._process_multiple_args, args_map).get(9999999)
            pool.close()
            pool.join()

    def genDB_singlethreaded(self):
        """In-process variant of genDB(); kept for debugging only."""
        _MUTEX_init(Lock(), self.output)
        if not self.mysql_live_execution:
            c = 0
            start, end = 0, self.buffer_size
            while c <= self.vn_max_id:
                for _ in range(self.threads_number):
                    if end > self.vn_max_id:
                        end = self.vn_max_id
                    self._process_multiple_args([start + 1, end + 1, [self.output, c]])
                    start += self.buffer_size
                    end += self.buffer_size
                c = end + 1
if __name__ == '__main__':
    # Parser configuration: page directory, highest id to scan, and the
    # number of pages per work unit.
    PATH = "html/v"
    VN_MAX_ID = 20137
    BUFFER_SIZE = 250
    # VNParser(path, vn_max_id, buffer_size, threads_number = None, mysql_live_execution = False, output = PATH)
    vnparser = VNParser(PATH, VN_MAX_ID, BUFFER_SIZE, 4, False, PATH)
    vnparser.genDB()
    #vnparser.genDB_singlethreaded()
# -*- coding: utf-8 -*-
import collections, sys, os
import MySQLdb
from bs4 import BeautifulSoup
from pprint import pprint
from multiprocessing import Pool, Lock, cpu_count
from copy_reg import pickle
from types import MethodType
# One-shot flag: the copy_reg hook below must be installed only once.
_PICKLE = False

def _pickle_method(method):
    # Reduce a bound method to (reconstructor, state) so multiprocessing
    # can pickle it (Python 2 cannot pickle bound methods natively).
    func_name = method.im_func.__name__
    obj = method.im_self
    cls = method.im_class
    return _unpickle_method, (func_name, obj, cls)
def _unpickle_method(func_name, obj, cls):
break_ = 0
for cls in cls.mro():
if break_: break
try:
func = cls.__dict__[func_name]
except KeyError:
pass
else:
break_ = 1
return func.__get__(obj, cls)
# Register the bound-method pickle hook exactly once per interpreter:
# Pool.map_async pickles self._process_multiple_args for each worker.
if not _PICKLE:
    pickle(MethodType, _pickle_method, _unpickle_method)
    _PICKLE = True
cdef class VNParser(object):
    # Cython port of the pure-Python VNParser: parses saved VNDB pages
    # into SQL INSERT dumps, or (with mysql_live_execution) writes rows
    # straight to a live MySQL connection.
    # Extension-type field declarations:
    cdef public bytes path                 # directory with <id>.html pages
    cdef public bytes output               # target .sql file path
    cdef public int vn_max_id              # highest id to scan
    cdef public int buffer_size            # pages per work unit
    cdef public int threads_number         # pool size
    cdef public int mysql_live_execution   # int-typed boolean flag
    cdef public object _cursor             # MySQL cursor (live mode only)
    cdef public object _db                 # MySQL connection (live mode only)
    cdef public object _mutex              # lock guarding output appends

    def __init__(self, output, vn_max_id, buffer_size, threads_number = None, mysql_live_execution = False, path = None):
        # NOTE: parameter order differs from the .py version — here the
        # OUTPUT comes first and path last.
        if not mysql_live_execution and not path:
            raise Exception("error, either mysql_live_execution or path must be specified")
        self.vn_max_id = vn_max_id
        self.buffer_size = buffer_size
        if not threads_number:
            # Default to one worker per CPU.
            self.threads_number = cpu_count()
        else:
            self.threads_number = threads_number
        self.mysql_live_execution = mysql_live_execution
        self.path = path
        self.output = output
        if mysql_live_execution:
            # Open the live connection up front and force UTF-8 on it.
            db = MySQLdb.connect(host='myanimespace.com', user='******', passwd='*******', db='myanimespace')
            db.set_character_set('utf8')
            cursor = db.cursor()
            cursor.execute('SET NAMES utf8;')
            cursor.execute('SET CHARACTER SET utf8;')
            cursor.execute('SET character_set_connection=utf8;')
            self._db = db
            self._cursor = cursor

    cdef inline getDictFromTable(self, table):
        # Convert the <tr> rows of a vndetails table to an OrderedDict:
        # two-cell rows become key -> text (or list of hrefs when the
        # value cell contains links); the td.vndesc row becomes
        # <h2 text> -> <p text>.
        res = collections.OrderedDict()
        for row in table:
            tds = row.findAll("td")
            if len(tds) == 2:
                key = tds[0].text
                value = tds[1]
                all_hrefs = value.findAll("a")
                if not all_hrefs:
                    value = value.text
                else:
                    value = [a.get("href") for a in all_hrefs]
            else:
                desc = row.find("td", attrs={'class': 'vndesc'})
                key = desc.find('h2').text
                value = desc.find('p').text
            res[key] = value
        return res

    def _process_multiple_args(self, args):
        # Pool.map adapter: unpacks one (start, end, writeout_args) tuple.
        start, end, writeout_args = args
        self.process(start, end, writeout_args)

    cdef inline process(self, start, end, writeout_args = None):
        # Parse pages start..end-1 into `tables` (id -> row dict) and
        # either flush them live to MySQL in BUFFER_SIZE batches or
        # append the rendered SQL to the output file.
        tables = collections.OrderedDict()
        v_count = 0
        for v_id in xrange(start, end):
            try:
                html = open("%s/%d.html" % (self.path, v_id), "rb").read()
                print("open %d.html" % v_id)
            except:
                print("%d.html is not found, skipping..." % v_id)
                continue
            parsed_html = BeautifulSoup(html, "html.parser")
            vndetails = parsed_html.body.find('div', attrs={'class':'vndetails'})
            if not vndetails:
                print("page %d.html doesn't have vndetails div, skipping..." % v_id)
                continue
            try:
                screens = [img.get('src') for img in parsed_html.body.find('div', attrs={'class':'scr'}).findAll("img")]
            except:
                print("screens weren't found on page %d.html" % v_id)
                screens = u'null'
            try:
                img = vndetails.find("p").find("img")
            except:
                #catch()
                print("title img was not found on page %d.html" % v_id)
                img = u'null'
            table = self.getDictFromTable(vndetails.findAll("tr"))
            table[u'image'] = img
            table[u'screens'] = screens
            tables[v_id] = table
            if self.mysql_live_execution:
                v_count += 1
                if v_count > BUFFER_SIZE:
                    # NOTE(review): BUFFER_SIZE is the module-level
                    # constant from the __main__ block, not
                    # self.buffer_size — looks unintentional; confirm.
                    print("cur v_id %d" % (v_id))
                    self.mysqlQueries(tables)
                    tables.clear()
                    v_count = 0
                    # NOTE(review): the attribute is self._db; self.db
                    # would raise AttributeError here — confirm.
                    self.db.commit()
                # flush any leftover buffered rows
                # NOTE(review): v_id never equals `end` inside
                # xrange(start, end), so this branch looks unreachable.
                if (v_id == end) and len(tables):
                    q = self.mysqlQueries(tables)
                    #print("MySQL queries:");pprint(q)
        if writeout_args:
            filename, id = writeout_args
            # Render and append under the shared lock.
            with self._mutex:
                s = self.mysqlQueries(tables)
                f = open(filename, 'ab')
                f.write(s)
                f.close()
            #pprint(s)

    cdef inline mysqlQueries(self, tables):
        # Render (dump mode) or execute (live mode) the INSERT for the
        # `novels` table.  NOTE(review): values are embedded by string
        # formatting with only single quotes escaped — not a
        # general-purpose SQL escaper.
        s = """
        INSERT INTO
            novels
            (
                id,
                romaji,
                japanese,
                duration,
                synopsis_en,
                posters
            )
            VALUES
        """
        if not self.mysql_live_execution:
            def prepare(v_data, args):
                # One "('v1', ..., 'v6')," group; missing keys -> 'null'.
                # NOTE(review): the first lookup key is unicode(v_id),
                # which is not a key of the row dict, so the id column
                # always renders as 'null' — looks like a bug; confirm.
                s = u"""
                (
                    '%s',
                    '%s',
                    '%s',
                    '%s',
                    '%s',
                    '%s',
                ),"""
                args = [unicode(v_data.get(i, u'null')).replace("'", "\\'") for i in args]
                return s % tuple(args)
            for v_id in tables:
                v_data = tables[v_id]
                s += prepare(v_data,
                    [unicode(v_id), u'Title', u'Original title', u'Length', u'Description', u'image'])
            s = s[:-1] + u";\n"  # drop the trailing comma
            return s.encode("u8")
        else:
            for v_id in tables:
                t = tables[v_id]
                placeholders = (str(v_id), t.get(u'Title', 'null'), t.get(u'Original title', 'null'), t.get(u'Length', 'null'), t.get(u'Description', 'null'), t.get(u'image', 'null'))
                # to pinpoint a failing SQL statement:
                #f = open('mysql.txt', 'w')
                #f.write(s)
                #f.write(', '.join(placeholders))
                #f.close()
                # NOTE(review): the attribute is self._cursor; self.cursor
                # would raise AttributeError here — confirm.
                self.cursor.execute(s, placeholders)

    cdef _MUTEX_init(self, mutex, out):
        # Worker initializer: install the shared lock and delete a stale
        # output file (missing file is silently ignored).
        self._mutex = mutex
        try:
            os.remove(out)
        except:
            pass

    def genDB(self):
        # Generate the SQL dump with a pool of workers, or (live mode)
        # stream the range straight into MySQL.
        if not self.mysql_live_execution:
            c = 0
            start, end = 0, self.buffer_size
            mutex = Lock()
            # NOTE(review): initializer is a bound (cdef!) method and
            # `self` is passed again in initargs — the worker would call
            # it with an extra argument, and cdef methods are not
            # Python-visible; this path looks broken — confirm against
            # the module-level _MUTEX_init used in the .py version.
            pool = Pool(processes = self.threads_number, initializer = self._MUTEX_init, initargs = (self, mutex, self.output))
            while c <= self.vn_max_id:
                args_map = []
                for i in range(self.threads_number):
                    if end > self.vn_max_id:
                        end = self.vn_max_id
                    args_map.append((start + 1, end + 1, [self.output, c]))
                    start += self.buffer_size
                    end += self.buffer_size
                c = end + 1
                # Huge .get() timeout keeps the main process interruptible.
                pool.map_async(self._process_multiple_args, args_map).get(9999999)
            pool.close()
            pool.join()
        else:
            # NOTE(review): 3184 is a hard-coded resume point; presumably
            # where a previous live run stopped — confirm.
            self.process(3184, self.vn_max_id);
            # NOTE(review): attribute is self._db, not self.db — confirm.
            self.db.commit()
            self.db.close()
if __name__ == '__main__':
    # Parser configuration: page directory, highest id to scan, and the
    # number of pages per work unit.
    PATH = "html/v"
    VN_MAX_ID = 20137
    BUFFER_SIZE = 250
    # VNParser(output, vn_max_id, buffer_size, threads_number = None, mysql_live_execution = False, path = PATH)
    vnparser = VNParser(PATH + "/sql_new.txt", VN_MAX_ID, BUFFER_SIZE, 4, False, PATH)
    vnparser.genDB()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment