-
-
Save cryzed/3ce353a451c508399dcbfb0d8ba6e7b4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import fnmatch | |
import re | |
import xmlrpclib | |
from base_adapter import BaseSiteAdapter, makeDate | |
def get_text(soup):
    """Return the text content of *soup* as a single string.

    Delegates to ``soup.get_text`` with a single space as the separator
    and ``strip=True``.
    """
    text = soup.get_text(separator=' ', strip=True)
    return text
class NovelUpdatesComAdapter(BaseSiteAdapter):
    """Site adapter for series pages on novelupdates.com.

    Metadata (title, authors, dates, description) and the chapter list are
    scraped from the series page and from the site's ``admin-ajax.php``
    endpoint. Chapter text is not parsed here: the fetched HTML is handed to
    an XML-RPC service expected at ``http://localhost:8100/``.
    """

    _SITE_DOMAIN = 'novelupdates.com'
    # The id group stops at the first '/', '?' or '#'. A greedy '.+' would
    # swallow a trailing slash, sub-path or query string into the story id
    # and leave the optional '(/.*)?' group with nothing to match.
    _SITE_URL_PATTERN = r'https?://%s/series/(?P<id>[^/?#]+)(/.*)?' % re.escape(_SITE_DOMAIN)
    _GET_CHAPTERS_URL = 'https://novelupdates.com/wp-admin/admin-ajax.php'
    # Dates on the site are parsed as month/day/two-digit-year.
    _DATE_FORMAT = '%m/%d/%y'

    def __init__(self, configuration, url):
        """Initialise the adapter and record the story id taken from *url*.

        NOTE(review): *url* is assumed to match ``_SITE_URL_PATTERN``; if it
        does not, ``re.match`` returns ``None`` and this raises
        ``AttributeError`` -- confirm the framework validates URLs first.
        """
        super(NovelUpdatesComAdapter, self).__init__(configuration, url)
        story_id = re.match(self._SITE_URL_PATTERN, url).group('id')
        self.story.setMetadata('storyId', story_id)
        self.story.setMetadata('siteabbrev', self._SITE_DOMAIN)
        # XML-RPC proxy used by getChapterText() to extract article HTML.
        self._newspaper_client = xmlrpclib.ServerProxy('http://localhost:8100/')

    @classmethod
    def _matches_site_path(cls, url, path_glob):
        """Return True if *url* is on this site (any scheme starting with
        ``http``, any host prefix) and its path matches *path_glob*.

        A missing or empty *url* never matches.
        """
        if not url:
            return False
        return bool(fnmatch.fnmatch(url, 'http*://*%s%s' % (cls._SITE_DOMAIN, path_glob)))

    @classmethod
    def _external_chapter_url_predicate(cls, url):
        """Return True for links pointing at the site's ``/extnu/`` path."""
        return cls._matches_site_path(url, '/extnu/*')

    @classmethod
    def _goto_chapter_url_predicate(cls, url):
        """Return True for the site's ``nu_goto_chapter.php`` links."""
        return cls._matches_site_path(url, '/nu_goto_chapter.php*')

    @staticmethod
    def getSiteDomain():
        """Return the domain this adapter handles."""
        return NovelUpdatesComAdapter._SITE_DOMAIN

    def getSiteURLPattern(self):
        """Return the regular expression that story URLs must match."""
        return self._SITE_URL_PATTERN

    @classmethod
    def getSiteExampleURLs(cls):
        """Return a list with one example story URL for this site."""
        return ['http://%s/series/title' % cls._SITE_DOMAIN]

    def doExtractChapterUrlsAndMetadata(self, get_cover=True):
        """Scrape the series page and fill in story metadata and chapter URLs.

        *get_cover* is accepted for interface compatibility but is not used
        by this adapter.
        """
        series_soup = self.make_soup(self._fetchUrl(self.url))

        self.story.setMetadata('title', get_text(series_soup.select_one('.seriestitlenu')))

        for author_link in series_soup.select('#showauthors > a'):
            author_url = author_link['href']
            self.story.addToList('author', get_text(author_link))
            # The author id is the last path segment of the author URL.
            self.story.addToList('authorId', author_url.rstrip('/').rsplit('/', 1)[1])
            self.story.addToList('authorUrl', author_url)

        # The first cell of the table body on the series page holds the date
        # used as the "updated" date.
        updated_cell = series_soup.select_one('#myTable > tbody td')
        self.story.setMetadata('dateUpdated', makeDate(get_text(updated_cell), self._DATE_FORMAT))
        self.setDescription(self.url, get_text(series_soup.select_one('#editdescription')))

        # The chapter list is served separately through the AJAX endpoint,
        # keyed by the post id embedded in the series page.
        mypostid = series_soup.select_one('#mypostid')['value']
        parameters = {'action': 'nd_getchapters', 'mygrr': 0, 'mypostid': mypostid}
        chapters_soup = self.make_soup(self._postUrl(self._GET_CHAPTERS_URL, parameters))

        # The links are appended in reverse document order.
        chapter_links = chapters_soup.find_all('a', href=self._external_chapter_url_predicate)
        for chapter_link in reversed(chapter_links):
            self.chapterUrls.append((get_text(chapter_link), chapter_link['href']))

        # The last "goto chapter" link is followed and the last row of the
        # table on that page supplies the publication date.
        # NOTE(review): presumably that page lists the earliest releases -- confirm.
        goto_links = chapters_soup.find_all('a', href=self._goto_chapter_url_predicate)
        first_chapter_url = goto_links[-1]['href']
        first_chapter_soup = self.make_soup(self._fetchUrl(first_chapter_url))
        oldest_row = first_chapter_soup.select('#myTable > tbody > tr')[-1]
        self.story.setMetadata('datePublished', makeDate(get_text(oldest_row.td), self._DATE_FORMAT))

    def getChapterText(self, url):
        """Fetch *url* and return the article HTML produced by the XML-RPC
        service's ``get_newspaper_article_html(url, html)`` call.
        """
        html = self._fetchUrl(url)
        return self._newspaper_client.get_newspaper_article_html(url, html)
def getClass():
    """Return the adapter class defined by this module."""
    adapter_class = NovelUpdatesComAdapter
    return adapter_class
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment