Skip to content

Instantly share code, notes, and snippets.

@rachmadaniHaryono
Created January 31, 2018 00:15
Show Gist options
  • Save rachmadaniHaryono/e7d40fcc5b9cd6ecc1f9151c4f0f5d84 to your computer and use it in GitHub Desktop.
Save rachmadaniHaryono/e7d40fcc5b9cd6ecc1f9151c4f0f5d84 to your computer and use it in GitHub Desktop.
# part 1 import
import inspect
import os
from gallery_dl import extractor as gallery_dl_extractor
from gallery_dl.job import Job
from yapsy.PluginManager import PluginManager
import structlog
###############################################################################
# part 2 import
import re
from bs4 import BeautifulSoup
from gallery_dl.extractor.common import Extractor, Message
from yapsy.IPlugin import IPlugin
###############################################################################
# Module-level structured logger for the CacheJob section of the gist.
log = structlog.getLogger(__name__)
class CacheJob(Job):
    """gallery_dl Job that registers yapsy plugin extractors before running.

    Plugin extractors discovered under ``extractor_paths`` (plus gallery_dl's
    own extractor package directory) are appended to gallery_dl's extractor
    cache, so URLs matched by plugin patterns can be dispatched.  ``run``
    collects every message the extractor yields into ``self.data`` instead of
    acting on them.
    """

    def __init__(self, url, parent=None, extractor_paths=None):
        """Collect and activate plugin extractors, then initialise the Job.

        Args:
            url: target URL handed to the underlying gallery_dl ``Job``.
            parent: optional parent job, forwarded to ``Job.__init__``.
            extractor_paths: extra directories to scan for yapsy plugins.
                The directory of gallery_dl's bundled ``extractor`` package
                is always appended.
        """
        # Copy so the caller's list is never mutated by the append below.
        extractor_paths = list(extractor_paths) if extractor_paths else []
        # BUGFIX: the original assigned ``extractor_module = None`` (a
        # placeholder) and then called inspect.getfile(None), which raises
        # TypeError.  The intended default module is gallery_dl's own
        # extractor package, already imported at the top of the file.
        extractor_paths.append(
            os.path.dirname(inspect.getfile(gallery_dl_extractor)))

        # Discover and activate every plugin found in the given places.
        plugin_manager = PluginManager()
        plugin_manager.setPluginPlaces(extractor_paths)
        plugin_manager.collectPlugins()
        for plugin_info in plugin_manager.getAllPlugins():
            plugin_manager.activatePluginByName(plugin_info.name)
            # plugin_object is a PluginExtractor instance yielding
            # (compiled_pattern, extractor_class) tuples.
            for extractor_set in plugin_info.plugin_object.get_extractor_sets():
                if extractor_set not in gallery_dl_extractor._cache:
                    log.debug('extractor added', ext_set=extractor_set)
                    gallery_dl_extractor._cache.append(extractor_set)

        Job.__init__(self, url, parent)
        # Messages produced by the extractor; filled in by run().
        self.data = []

    def run(self):
        """Drain the extractor and cache all of its messages in order."""
        self.data = list(self.extractor)
###############################################################################
# Logger for the second section of the gist; re-binds the same named logger.
log = structlog.getLogger(__name__)
class ModuleExtractor(Extractor):
    """Skeleton gallery_dl extractor for ``http://www.example.com`` URLs.

    ``pattern`` holds the raw regexes that PluginExtractor compiles and
    registers; ``items`` fetches the matched page, delegates scraping to
    :meth:`parse`, and yields gallery_dl messages.
    """

    pattern = [r'http:\/\/www\.example.com']

    def __init__(self, match):
        """Initialise from a regex *match* and seed the metadata dict."""
        Extractor.__init__(self)
        self.metadata = {
            'url': match.string,
            'data_key1': 'data1',
            'data_key2': 'data2',
        }

    @staticmethod
    def parse(soup, url=None):
        """Scrape *soup* into metadata and message payloads.

        Returns a dict with three keys: ``metadata`` (dict merged into the
        extractor's metadata), ``message_url`` and ``message_queue`` (lists
        of 2-item payloads).  This skeleton returns empty containers.
        """
        result = {
            'metadata': {},
            'message_url': [],
            'message_queue': [],
        }
        # processing the html soup
        # ...
        return result

    def items(self):
        """Fetch the matched URL and yield gallery_dl messages."""
        target = self.metadata['url']
        response = self.request(target)
        soup = BeautifulSoup(response.text, 'html.parser')
        parsed = ModuleExtractor.parse(soup, url=target)

        self.metadata.update(parsed.pop('metadata'))
        yield Message.Directory, self.metadata
        for payload in parsed.pop('message_url'):
            yield Message.Url, payload[0], payload[1]
        for payload in parsed.pop('message_queue'):
            yield Message.Queue, payload[0], payload[1]
class PluginExtractor(IPlugin):
    """yapsy plugin exposing ModuleExtractor's pattern/class pairs."""

    @staticmethod
    def get_extractor_sets():
        """Yield ``(compiled_pattern, extractor_class)`` tuples.

        Each tuple is suitable for appending to gallery_dl's extractor
        cache so the plugin's patterns participate in URL dispatch.
        """
        extractor_cls = ModuleExtractor
        for raw_pattern in extractor_cls.pattern:
            yield (re.compile(raw_pattern), extractor_cls)
@Rockstar43
Copy link

???

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment