Created
January 31, 2018 00:15
-
-
Save rachmadaniHaryono/e7d40fcc5b9cd6ecc1f9151c4f0f5d84 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# part 1 import | |
import inspect | |
import os | |
from gallery_dl import extractor as gallery_dl_extractor | |
from gallery_dl.job import Job | |
from yapsy.PluginManager import PluginManager | |
import structlog | |
############################################################################### | |
# part 2 import | |
import re | |
from bs4 import BeautifulSoup | |
from gallery_dl.extractor.common import Extractor, Message | |
from yapsy.IPlugin import IPlugin | |
############################################################################### | |
# Module-level structlog logger used by CacheJob below.
log = structlog.getLogger(__name__)
class CacheJob(Job):
    """Job that collects extractor messages into ``self.data`` instead of
    processing them.

    Before running, extra extractors are discovered as yapsy plugins from
    *extractor_paths* and registered with gallery_dl's extractor cache so
    that ``Job`` can resolve them for *url*.
    """

    def __init__(self, url, parent=None, extractor_paths=None):
        """Initialize the job.

        Args:
            url: target URL, forwarded to ``Job.__init__``.
            parent: optional parent job, forwarded to ``Job.__init__``.
            extractor_paths: extra directories searched for extractor
                plugins; gallery_dl's built-in extractor package directory
                is always searched as well.
        """
        # Copy instead of appending to the caller's list (the original
        # mutated the argument in place).
        paths = list(extractor_paths) if extractor_paths else []
        # The original set ``extractor_module = None`` as a placeholder,
        # which made ``inspect.getfile(None)`` raise TypeError.  The
        # "program default module" is gallery_dl's extractor package itself.
        paths.append(os.path.dirname(inspect.getfile(gallery_dl_extractor)))

        # Build the plugin manager and point it at the search paths.
        plugin_manager = PluginManager()
        plugin_manager.setPluginPlaces(paths)
        plugin_manager.collectPlugins()
        for plugin_info in plugin_manager.getAllPlugins():
            # Activate every loaded plugin.
            plugin_manager.activatePluginByName(plugin_info.name)
            # plugin_object is a PluginExtractor-style object yielding
            # (compiled_pattern, extractor_class) pairs.
            for extractor_set in plugin_info.plugin_object.get_extractor_sets():
                if extractor_set not in gallery_dl_extractor._cache:
                    log.debug('extractor added', ext_set=extractor_set)
                    gallery_dl_extractor._cache.append(extractor_set)
        Job.__init__(self, url, parent)
        # Filled by run(); holds every message the extractor produced.
        self.data = []

    def run(self):
        """Drain the extractor and store all of its messages in ``self.data``."""
        self.data = list(self.extractor)
############################################################################### | |
# NOTE(review): duplicate of the logger assignment earlier in the file —
# the gist concatenates two snippets; re-binding to the same logger is harmless.
log = structlog.getLogger(__name__)
class ModuleExtractor(Extractor):
    """Example gallery_dl extractor for ``http://www.example.com`` pages."""

    # Fixed regex: the original ``http:\/\/www\.example.com`` left the final
    # dot unescaped (it matched ANY character, e.g. "exampleXcom") and
    # needlessly escaped the slashes.
    pattern = [r'http://www\.example\.com']

    def __init__(self, match):
        """Store metadata for the matched URL.

        Args:
            match: the ``re.Match`` produced by ``pattern``; ``match.string``
                is the full input URL.
        """
        Extractor.__init__(self)
        self.metadata = {
            'url': match.string,
            'data_key1': 'data1',
            'data_key2': 'data2',
        }

    @staticmethod
    def parse(soup, url=None):
        """Parse the page *soup* into metadata and message lists.

        Args:
            soup: ``BeautifulSoup`` document of the fetched page.
            url: source URL of *soup*, if known.

        Returns:
            dict with keys ``metadata`` (dict), ``message_url`` (list of
            (url, data) pairs) and ``message_queue`` (list of (url, data)
            pairs).  This gist leaves the actual parsing as a stub.
        """
        metadata = {}
        message_url = []
        message_queue = []
        # processing the html soup
        # ...
        return {
            'metadata': metadata,
            'message_url': message_url,
            'message_queue': message_queue,
        }

    def items(self):
        """Yield gallery_dl messages for the stored URL."""
        url = self.metadata['url']
        resp = self.request(url)
        soup = BeautifulSoup(resp.text, 'html.parser')
        parse_res = ModuleExtractor.parse(soup, url=url)
        # Merge page-derived metadata into the constructor defaults.
        self.metadata.update(parse_res.pop('metadata'))
        yield Message.Directory, self.metadata
        for item in parse_res.pop('message_url'):
            yield Message.Url, item[0], item[1]
        for item in parse_res.pop('message_queue'):
            yield Message.Queue, item[0], item[1]
class PluginExtractor(IPlugin):
    """yapsy plugin wrapper that exposes ``ModuleExtractor`` to ``CacheJob``."""

    @staticmethod
    def get_extractor_sets():
        """Yield ``(compiled_pattern, extractor_class)`` pairs, one per
        entry in ``ModuleExtractor.pattern``."""
        for pattern_text in ModuleExtractor.pattern:
            compiled = re.compile(pattern_text)
            yield compiled, ModuleExtractor
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
???