Last active
October 20, 2016 17:56
-
-
Save ChronoMonochrome/96517538b83ec1872b140c1717ac37aa to your computer and use it in GitHub Desktop.
Visual novels database parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*.c
*.pyc
*.html
*.o
*.so
build
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""Build script for the Cython extension module ``vnparser``."""
try:
    from setuptools import setup
    from setuptools import Extension
except ImportError:
    # Fall back to the plain distutils machinery when setuptools is absent.
    from distutils.core import setup
    from distutils.extension import Extension

from Cython.Distutils import build_ext

# import numpy as np

ext_modules = [Extension("vnparser", ["vnparser.pyx"])]

setup(
    name='Visual database parser class',
    cmdclass={'build_ext': build_ext},
    include_dirs=['/usr/local/lib/python2.7/dist-packages/'],
    ext_modules=ext_modules)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Example driver: parse locally saved vndb pages into .sql files."""
from vnparser import *

PATH = "../html/v"
VN_MAX_ID = 40
BUFFER_SIZE = 10

# Signature: VNParser(path, vn_max_id, buffer_size, threads_number,
#                     mysql_live_execution, output)
v = VNParser(PATH, VN_MAX_ID, BUFFER_SIZE, 4, False, PATH)
v.genDB()
# v.genDB_singlethreaded()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import collections, sys, os | |
import MySQLdb | |
from bs4 import BeautifulSoup | |
from pprint import pprint | |
from multiprocessing import Pool, Lock, cpu_count | |
from copy_reg import pickle | |
from types import MethodType | |
# Lock shared by all pool workers; installed per-worker by _MUTEX_init().
_MUTEX = None
# One-shot guard for registering the MethodType pickle handlers below.
_PICKLE_METHOD_REGISTRED = False
def _pickle_method(method):
    """Reduce a bound method to a picklable (callable, args) pair.

    Python 2's pickle cannot serialize MethodType, which multiprocessing
    needs in order to ship ``VNParser._process_multiple_args`` to workers.
    """
    return _unpickle_method, (method.im_func.__name__,
                              method.im_self,
                              method.im_class)
def _unpickle_method(func_name, obj, cls):
    """Inverse of _pickle_method: find func_name along the MRO of cls and
    bind it to obj, reconstructing the original bound method."""
    for cls in cls.mro():
        if func_name in cls.__dict__:
            func = cls.__dict__[func_name]
            break
    return func.__get__(obj, cls)
# Teach pickle (and therefore multiprocessing) how to serialize bound
# methods, so Pool.map can dispatch instance methods of VNParser.
if not _PICKLE_METHOD_REGISTRED:
    pickle(MethodType, _pickle_method, _unpickle_method)
    _PICKLE_METHOD_REGISTRED = True
def _MUTEX_init(mutex, out):
    """multiprocessing.Pool worker initializer.

    Stores the shared lock in the worker's module globals and deletes any
    stale output files, so the appends done by VNParser.process() start
    from empty files.

    mutex -- multiprocessing.Lock shared by every worker
    out   -- directory that holds novels.sql / novels_screenshots.sql
    """
    global _MUTEX
    _MUTEX = mutex
    # Remove each file in its own try: with the original single try/except,
    # a missing novels.sql aborted the block and left a stale
    # novels_screenshots.sql behind.  Only OSError (missing file, perms)
    # is expected here; anything else should surface.
    for leftover in ("novels.sql", "novels_screenshots.sql"):
        try:
            os.remove(out + "/" + leftover)
        except OSError:
            pass
class VNParser(object):
    """Parses locally saved vndb.org pages (<path>/<id>.html) and emits
    multi-row MySQL INSERT statements for the ``novels`` and
    ``novels_screenshots`` tables.

    Work is fanned out over a multiprocessing.Pool; each worker appends its
    SQL to the shared .sql files under the lock installed by _MUTEX_init().
    """

    def __init__(self, path, vn_max_id, buffer_size, threads_number = None,
                 mysql_live_execution = False, output = None):
        """
        path                 -- directory with the dumped <id>.html pages
        vn_max_id            -- highest visual-novel id to attempt
        buffer_size          -- how many ids one worker task covers
        threads_number       -- pool size; defaults to cpu_count()
        mysql_live_execution -- execute against a live DB instead of files
                                (not implemented in this pure-Python module)
        output               -- directory receiving the .sql files; required
                                unless mysql_live_execution is set
        """
        if not mysql_live_execution and not output:
            raise Exception("error, either mysql_live_execution or output must be specified")
        self.vn_max_id = vn_max_id
        self.buffer_size = buffer_size
        self.threads_number = threads_number if threads_number else cpu_count()
        self.mysql_live_execution = mysql_live_execution
        self.path = path
        self.output = output
        # NOTE(review): the live-MySQL connection setup that used to live
        # here was commented out upstream; mysql_live_execution is
        # effectively unsupported in this module.

    def __exit__(self):
        # NOTE(review): __exit__ without __enter__ (and without the
        # exc-info parameters) — this is not a usable context manager.
        pass

    def getDictFromTable(self, table):
        """Turn the rows of a vndetails table into an OrderedDict.

        Two-cell rows map header text -> cell text (or the list of href
        targets when the cell contains links); the description row maps its
        <h2> heading to the <p> body.
        """
        res = collections.OrderedDict()
        for row in table:
            tds = row.findAll("td")
            if len(tds) == 2:
                key = tds[0].text
                value = tds[1]
                all_hrefs = value.findAll("a")
                if not all_hrefs:
                    value = value.text
                else:
                    value = [a.get("href") for a in all_hrefs]
            else:
                desc = row.find("td", attrs={'class': 'vndesc'})
                key = desc.find('h2').text
                value = desc.find('p').text
            res[key] = value
        return res

    def _process_multiple_args(self, args):
        """Pool-friendly wrapper: unpack one (start, end, writeout_args) task."""
        start, end, writeout_args = args
        self.process(start, end, writeout_args)

    def process(self, start, end, writeout_args = None):
        """Parse pages [start, end) and append the resulting SQL.

        writeout_args -- [output_dir, task_id]; when given, the generated
                         INSERT statements are appended (under the shared
                         lock) to <output_dir>/novels.sql and
                         <output_dir>/novels_screenshots.sql.
        """
        novels = []
        novels_screenshots = []
        for v_id in xrange(start, end):
            try:
                with open("%s/%d.html" % (self.path, v_id), "rb") as f:
                    html = f.read()
                print("open %d.html" % v_id)
            except (IOError, OSError):
                print("%d.html is not found, skipping..." % v_id)
                continue
            parsed_html = BeautifulSoup(html, "html.parser")
            vndetails = parsed_html.body.find('div', attrs={'class': 'vndetails'})
            if not vndetails:
                print("page %d.html doesn't have vndetails div, skipping..." % v_id)
                continue
            try:
                screens = [img.get('src') for img in
                           parsed_html.body.find('div', attrs={'class': 'scr'}).findAll("img")]
            except AttributeError:  # find() returned None: no 'scr' div
                print("screens weren't found on page %d.html" % v_id)
                # BUG FIX: this used to be None, which crashed the
                # screenshot loop below with a TypeError.
                screens = []
            try:
                img = vndetails.find("p").find("img")
            except AttributeError:  # no <p> wrapper around the cover image
                print("title img was not found on page %d.html" % v_id)
                img = u'null'
            novel = self.getDictFromTable(vndetails.findAll("tr"))
            novel[u'v_id'] = v_id
            novel[u'image'] = img
            novels.append(novel)
            for screen_id, screen in enumerate(screens):
                shot = collections.OrderedDict()
                shot[u'v_id'] = unicode(v_id)
                shot[u'screen_id'] = unicode(screen_id)
                shot[u'screenshots'] = screen
                novels_screenshots.append(shot)
        if writeout_args:
            filepath, task_id = writeout_args

            def writeout(name, rows, column_map):
                # Append the INSERT for `rows` to <filepath>/<name>.sql.
                # The lock keeps statements from concurrent workers from
                # interleaving inside the shared file.
                sql = self.mysqlQueries(name, rows, column_map)
                with _MUTEX:
                    with open(filepath + "/" + name + ".sql", 'ab') as out:
                        out.write(sql)

            # SQL column name -> key in the parsed page dict.
            novels_map = collections.OrderedDict()
            novels_map['v_id'] = u'v_id'
            novels_map['romaji'] = u'Title'
            novels_map['japanese'] = u'Original title'
            novels_map['duration'] = u'Length'
            novels_map['synopsis_en'] = u'Description'
            novels_map['posters'] = u'image'
            writeout("novels", novels, novels_map)

            shots_map = collections.OrderedDict()
            shots_map['v_id'] = u'v_id'
            shots_map['screen_id'] = u'screen_id'
            shots_map['screenshots'] = u'screenshots'
            writeout("novels_screenshots", novels_screenshots, shots_map)

    def mysqlQueries(self, tables_name, tables, table_column_dict):
        """Render one multi-row INSERT statement for `tables`.

        tables_name       -- SQL table name
        tables            -- list of per-novel dicts built by process()
        table_column_dict -- OrderedDict mapping SQL column name -> key in
                             each row dict (insertion order = column order)

        Returns UTF-8 encoded bytes; b'' when there is nothing to insert.
        """
        names, values = table_column_dict.keys(), table_column_dict.values()
        if not self.mysql_live_execution:
            if not tables:
                # BUG FIX: the original sliced two characters off the bare
                # header, emitting a truncated "INSERT ... VALU;" statement
                # whenever a worker produced no rows.
                return b""

            def prepare(row, keys):
                # One "('v1',\n'v2',\n...)" tuple; single quotes escaped.
                # NOTE(review): this hand-escaping is not injection-safe,
                # but input comes from local trusted dumps.
                fmt = u"(%s),\n" % ("'%s',\n" * (len(keys) - 1) + "'%s'\n")
                escaped = [unicode(row.get(k, u'null')).replace("'", "\\'")
                           for k in keys]
                return fmt % tuple(escaped)

            s = ("""INSERT INTO %s (%s) VALUES""" %
                 (tables_name, "%s,\n" * (len(names) - 1) + '%s\n')) % (tuple(names))
            for row in tables:
                s += prepare(row, values)
            # Replace the trailing ",\n" of the last row tuple with ";\n".
            s = s[:-2] + u";\n"
            return s.encode("utf-8")
        # Live-MySQL execution was commented out upstream and is not
        # implemented here; fall through returning None, as before.

    def genDB(self):
        """Fan the id range out over a process pool and write the .sql files.

        Each task covers `buffer_size` ids; `threads_number` tasks are queued
        per round until vn_max_id is covered.  Workers append to the shared
        output files under the lock installed by _MUTEX_init.
        """
        if not self.mysql_live_execution:
            c = 0
            start, end = 0, self.buffer_size
            mutex = Lock()
            pool = Pool(processes = self.threads_number,
                        initializer = _MUTEX_init,
                        initargs = (mutex, self.output))
            while c <= self.vn_max_id:
                args_map = []
                for i in range(self.threads_number):
                    if end > self.vn_max_id:
                        end = self.vn_max_id
                    args_map.append((start + 1, end + 1, [self.output, c]))
                    start += self.buffer_size
                    end += self.buffer_size
                c = end + 1
                # .get() with a huge timeout (instead of a bare .get())
                # keeps the main process interruptible with Ctrl-C.
                pool.map_async(self._process_multiple_args, args_map).get(9999999)
            pool.close()
            pool.join()

    def genDB_singlethreaded(self):
        """Debug variant of genDB: same task layout, executed inline."""
        _MUTEX_init(Lock(), self.output)
        if not self.mysql_live_execution:
            c = 0
            start, end = 0, self.buffer_size
            while c <= self.vn_max_id:
                for i in range(self.threads_number):
                    if end > self.vn_max_id:
                        end = self.vn_max_id
                    self._process_multiple_args([start + 1, end + 1, [self.output, c]])
                    start += self.buffer_size
                    end += self.buffer_size
                c = end + 1
if __name__ == '__main__':
    # Script entry point: parse every dumped page under html/v.
    PATH = "html/v"
    VN_MAX_ID = 20137
    BUFFER_SIZE = 250
    # Signature reminder:
    # VNParser(path, vn_max_id, buffer_size, threads_number=None,
    #          mysql_live_execution=False, output=PATH)
    parser = VNParser(PATH, VN_MAX_ID, BUFFER_SIZE, 4, False, PATH)
    parser.genDB()
    # parser.genDB_singlethreaded()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import collections, sys, os | |
import MySQLdb | |
from bs4 import BeautifulSoup | |
from pprint import pprint | |
from multiprocessing import Pool, Lock, cpu_count | |
from copy_reg import pickle | |
from types import MethodType | |
_PICKLE = False | |
def _pickle_method(method):
    """Reduce a bound method to a picklable (callable, args) pair so
    multiprocessing can ship instance methods to worker processes."""
    return _unpickle_method, (method.im_func.__name__,
                              method.im_self,
                              method.im_class)
def _unpickle_method(func_name, obj, cls):
    """Inverse of _pickle_method: find func_name along the MRO of cls and
    bind it to obj, reconstructing the original bound method.

    BUG FIX: the previous flag-based loop (`break_`) only exited on the
    *next* iteration, so `cls` had already advanced one class past the
    match and the wrong owner class was passed to __get__.  This now
    matches the pure-Python twin of this function.
    """
    for cls in cls.mro():
        try:
            func = cls.__dict__[func_name]
        except KeyError:
            pass
        else:
            break
    return func.__get__(obj, cls)
# Teach pickle (and therefore multiprocessing) how to serialize bound
# methods, so Pool.map can dispatch instance methods of VNParser.
if not _PICKLE:
    pickle(MethodType, _pickle_method, _unpickle_method)
    _PICKLE = True
cdef class VNParser(object):
    """Cython twin of the pure-Python vnparser: parses locally saved
    vndb.org pages (<path>/<id>.html) and generates/executes MySQL INSERTs
    for the ``novels`` table.

    NOTE(review): the argument order differs from the pure-Python variant —
    here it is (output, ..., path) instead of (path, ..., output).
    """
    # Typed attribute declarations are mandatory on a cdef class
    # (instances have no __dict__).
    cdef public bytes path
    cdef public bytes output
    cdef public int vn_max_id
    cdef public int buffer_size
    cdef public int threads_number
    cdef public int mysql_live_execution
    cdef public object _cursor
    cdef public object _db
    cdef public object _mutex

    def __init__(self, output, vn_max_id, buffer_size, threads_number = None, mysql_live_execution = False, path = None):
        """
        output               -- file that receives the generated SQL
        vn_max_id            -- highest page id to attempt
        buffer_size         -- ids per worker task
        threads_number       -- pool size; defaults to cpu_count()
        mysql_live_execution -- execute against the live DB instead of a file
        path                 -- directory with the dumped <id>.html pages
        """
        if not mysql_live_execution and not path:
            raise Exception("error, either mysql_live_execution or path must be specified")
        self.vn_max_id = vn_max_id
        self.buffer_size = buffer_size
        if not threads_number:
            self.threads_number = cpu_count()
        else:
            self.threads_number = threads_number
        self.mysql_live_execution = mysql_live_execution
        self.path = path
        self.output = output
        if mysql_live_execution:
            # Credentials were redacted upstream.
            db = MySQLdb.connect(host='myanimespace.com', user='******', passwd='*******', db='myanimespace')
            db.set_character_set('utf8')
            cursor = db.cursor()
            cursor.execute('SET NAMES utf8;')
            cursor.execute('SET CHARACTER SET utf8;')
            cursor.execute('SET character_set_connection=utf8;')
            self._db = db
            self._cursor = cursor

    cdef inline getDictFromTable(self, table):
        # Convert vndetails table rows to an OrderedDict: two-cell rows map
        # header text -> cell text (or the list of href targets when the
        # cell contains links); the description row maps its <h2> heading
        # to the <p> body.
        res = collections.OrderedDict()
        for row in table:
            tds = row.findAll("td")
            if len(tds) == 2:
                key = tds[0].text
                value = tds[1]
                all_hrefs = value.findAll("a")
                if not all_hrefs:
                    value = value.text
                else:
                    value = [a.get("href") for a in all_hrefs]
            else:
                desc = row.find("td", attrs={'class': 'vndesc'})
                key = desc.find('h2').text
                value = desc.find('p').text
            res[key] = value
        return res

    def _process_multiple_args(self, args):
        """Pool-friendly wrapper: unpack one (start, end, writeout_args) task."""
        start, end, writeout_args = args
        self.process(start, end, writeout_args)

    cdef inline process(self, start, end, writeout_args = None):
        # Parse pages [start, end) into `tables` keyed by page id, then
        # either stream them to MySQL (live mode) or append the generated
        # SQL to the file named in writeout_args[0].
        tables = collections.OrderedDict()
        v_count = 0
        for v_id in xrange(start, end):
            try:
                html = open("%s/%d.html" % (self.path, v_id), "rb").read()
                print("open %d.html" % v_id)
            except:
                print("%d.html is not found, skipping..." % v_id)
                continue
            parsed_html = BeautifulSoup(html, "html.parser")
            vndetails = parsed_html.body.find('div', attrs={'class':'vndetails'})
            if not vndetails:
                print("page %d.html doesn't have vndetails div, skipping..." % v_id)
                continue
            try:
                screens = [img.get('src') for img in parsed_html.body.find('div', attrs={'class':'scr'}).findAll("img")]
            except:
                print("screens weren't found on page %d.html" % v_id)
                screens = u'null'
            try:
                img = vndetails.find("p").find("img")
            except:
                #catch()
                print("title img was not found on page %d.html" % v_id)
                img = u'null'
            table = self.getDictFromTable(vndetails.findAll("tr"))
            table[u'image'] = img
            table[u'screens'] = screens
            tables[v_id] = table
            if self.mysql_live_execution:
                # Flush a batch to the DB every BUFFER_SIZE pages.
                # NOTE(review): BUFFER_SIZE is only defined under the
                # __main__ guard, and the attributes are _db/_cursor —
                # `self.db` here looks like it would fail; confirm before
                # using live mode.
                v_count += 1
                if v_count > BUFFER_SIZE:
                    print("cur v_id %d" % (v_id))
                    self.mysqlQueries(tables)
                    tables.clear()
                    v_count = 0
                    self.db.commit()
        # If any rows remain (translated from Russian comment)
        if (v_id == end) and len(tables):
            # NOTE(review): xrange stops at end-1, so v_id never equals
            # `end` — this leftover flush appears unreachable; confirm.
            q = self.mysqlQueries(tables)
            #print("MySQL queries:");pprint(q)
        if writeout_args:
            filename, id = writeout_args
            # Serialize the append under the shared lock so concurrent
            # workers don't interleave their statements.
            with self._mutex:
                s = self.mysqlQueries(tables)
                f = open(filename, 'ab')
                f.write(s)
                f.close()
        #pprint(s)

    cdef inline mysqlQueries(self, tables):
        # Build one multi-row INSERT for `tables` ({v_id: field dict}).
        # File mode returns UTF-8 bytes; live mode executes per-row.
        s = """
        INSERT INTO
            novels
            (
                id,
                romaji,
                japanese,
                duration,
                synopsis_en,
                posters
            )
        VALUES
        """
        if not self.mysql_live_execution:
            def prepare(v_data, args):
                # NOTE(review): the generated value tuple keeps a trailing
                # comma before the closing parenthesis — invalid SQL for
                # most servers; the pure-Python variant fixed this.
                s = u"""
                (
                '%s',
                '%s',
                '%s',
                '%s',
                '%s',
                '%s',
                ),"""
                args = [unicode(v_data.get(i, u'null')).replace("'", "\\'") for i in args]
                return s % tuple(args)
            for v_id in tables:
                v_data = tables[v_id]
                s += prepare(v_data,
                    [unicode(v_id), u'Title', u'Original title', u'Length', u'Description', u'image'])
            # Replace the trailing comma of the last tuple with ";".
            s = s[:-1] + u";\n"
            return s.encode("u8")
        else:
            for v_id in tables:
                t = tables[v_id]
                placeholders = (str(v_id), t.get(u'Title', 'null'), t.get(u'Original title', 'null'), t.get(u'Length', 'null'), t.get(u'Description', 'null'), t.get(u'image', 'null'))
                # For pinpointing the failing SQL statement
                # (translated from Russian comment)
                #f = open('mysql.txt', 'w')
                #f.write(s)
                #f.write(', '.join(placeholders))
                #f.close()
                # NOTE(review): `s` contains no %s placeholders and the
                # attribute is `_cursor`, not `cursor` — this call looks
                # like it would fail; confirm before using live mode.
                self.cursor.execute(s, placeholders)

    cdef _MUTEX_init(self, mutex, out):
        # Pool-worker initializer: store the shared lock and delete any
        # stale output file so process() appends from scratch.
        self._mutex = mutex
        try:
            os.remove(out)
        except:
            pass

    def genDB(self):
        """Fan the id range out over a process pool (file mode) or run the
        whole range inline against the live DB."""
        if not self.mysql_live_execution:
            c = 0
            start, end = 0, self.buffer_size
            mutex = Lock()
            # NOTE(review): passing the bound method plus `self` again in
            # initargs gives the initializer an extra argument, and cdef
            # methods are not normally reachable as Python attributes —
            # confirm this pool setup actually works.
            pool = Pool(processes = self.threads_number, initializer = self._MUTEX_init, initargs = (self, mutex, self.output))
            while c <= self.vn_max_id:
                args_map = []
                for i in range(self.threads_number):
                    if end > self.vn_max_id:
                        end = self.vn_max_id
                    args_map.append((start + 1, end + 1, [self.output, c]))
                    start += self.buffer_size
                    end += self.buffer_size
                c = end + 1
                # Huge timeout keeps the main process Ctrl-C interruptible.
                pool.map_async(self._process_multiple_args, args_map).get(9999999)
            pool.close()
            pool.join()
        else:
            # NOTE(review): 3184 is a hard-coded resume point, presumably
            # from a previous interrupted run; also `self.db` vs `_db`.
            self.process(3184, self.vn_max_id);
            self.db.commit()
            self.db.close()
if __name__ == '__main__':
    # Script entry point: parse every dumped page under html/v into one file.
    PATH = "html/v"
    VN_MAX_ID = 20137
    BUFFER_SIZE = 250
    # Signature reminder:
    # VNParser(output, vn_max_id, buffer_size, threads_number=None,
    #          mysql_live_execution=False, path=PATH)
    parser = VNParser(PATH + "/sql_new.txt", VN_MAX_ID, BUFFER_SIZE, 4, False, PATH)
    parser.genDB()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment