Gist by @cryzed, created August 27, 2017 19:16.
import fnmatch
import re
import xmlrpclib

from base_adapter import BaseSiteAdapter, makeDate


def get_text(soup):
    return soup.get_text(separator=' ', strip=True)


class NovelUpdatesComAdapter(BaseSiteAdapter):
    _SITE_DOMAIN = 'novelupdates.com'
    _SITE_URL_PATTERN = r'https?://%s/series/(?P<id>.+)(/.*)?' % re.escape(_SITE_DOMAIN)
    _GET_CHAPTERS_URL = 'https://novelupdates.com/wp-admin/admin-ajax.php'
    _DATE_FORMAT = '%m/%d/%y'

    def __init__(self, configuration, url):
        super(NovelUpdatesComAdapter, self).__init__(configuration, url)
        self.story.setMetadata('storyId', re.match(self._SITE_URL_PATTERN, url).group('id'))
        self.story.setMetadata('siteabbrev', self._SITE_DOMAIN)
        # XML-RPC client for a local service that extracts the readable article HTML
        # from a fetched chapter page (see getChapterText below).
        self._newspaper_client = xmlrpclib.ServerProxy('http://localhost:8100/')

    @classmethod
    def _external_chapter_url_predicate(cls, url):
        # Matches the "/extnu/" redirect links used for externally hosted chapters.
        return url and fnmatch.fnmatch(url, 'http*://*%s/extnu/*' % cls._SITE_DOMAIN)

    @classmethod
    def _goto_chapter_url_predicate(cls, url):
        return url and fnmatch.fnmatch(url, 'http*://*%s/nu_goto_chapter.php*' % cls._SITE_DOMAIN)

    @staticmethod
    def getSiteDomain():
        return NovelUpdatesComAdapter._SITE_DOMAIN

    def getSiteURLPattern(self):
        return self._SITE_URL_PATTERN

    @classmethod
    def getSiteExampleURLs(cls):
        return ['http://%s/series/title' % cls._SITE_DOMAIN]

    def doExtractChapterUrlsAndMetadata(self, get_cover=True):
        soup = self.make_soup(self._fetchUrl(self.url))
        self.story.setMetadata('title', get_text(soup.select_one('.seriestitlenu')))

        for a in soup.select('#showauthors > a'):
            self.story.addToList('author', get_text(a))
            self.story.addToList('authorId', a['href'].rstrip('/').rsplit('/', 1)[1])
            self.story.addToList('authorUrl', a['href'])

        self.story.setMetadata('dateUpdated', makeDate(get_text(soup.select_one('#myTable > tbody td')), self._DATE_FORMAT))
        self.setDescription(self.url, get_text(soup.select_one('#editdescription')))

        # The chapter list is loaded via a WordPress admin-ajax request keyed on the post ID.
        mypostid = soup.select_one('#mypostid')['value']
        parameters = {'action': 'nd_getchapters', 'mygrr': 0, 'mypostid': mypostid}
        soup2 = self.make_soup(self._postUrl(self._GET_CHAPTERS_URL, parameters))

        # The chapter links come back newest first; reverse them into reading order.
        for a in reversed(soup2('a', href=self._external_chapter_url_predicate)):
            self.chapterUrls.append((get_text(a), a['href']))

        # Follow the last "goto chapter" link and read the publication date from the
        # final row of its release table.
        first_chapter_url = soup2('a', href=self._goto_chapter_url_predicate)[-1]['href']
        soup3 = self.make_soup(self._fetchUrl(first_chapter_url))
        self.story.setMetadata('datePublished', makeDate(get_text(soup3.select('#myTable > tbody > tr')[-1].td), self._DATE_FORMAT))

    def getChapterText(self, url):
        html = self._fetchUrl(url)
        # Delegate article extraction to the local XML-RPC service.
        return self._newspaper_client.get_newspaper_article_html(url, html)


def getClass():
    return NovelUpdatesComAdapter
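
getChapterText hands the raw chapter HTML to an XML-RPC service expected at http://localhost:8100/ that exposes a get_newspaper_article_html method; that service is not included in this gist. Below is a minimal sketch of what such a companion server could look like, assuming Python 2 (to match xmlrpclib above) and the newspaper library; the exact newspaper API (keep_article_html, set_html, article_html) can differ between versions, so treat this as illustrative rather than as the author's actual service.

# Hypothetical companion service for the adapter above: an XML-RPC server on
# localhost:8100 exposing get_newspaper_article_html(url, html). Not part of
# the original gist; assumes the "newspaper" library is installed.
from SimpleXMLRPCServer import SimpleXMLRPCServer

from newspaper import Article


def get_newspaper_article_html(url, html):
    # keep_article_html=True asks newspaper to retain the cleaned article HTML
    # (not just the plain text) after parsing.
    article = Article(url, keep_article_html=True)
    article.set_html(html)
    article.parse()
    return article.article_html


server = SimpleXMLRPCServer(('localhost', 8100))
server.register_function(get_newspaper_article_html)
server.serve_forever()

With a server like this running, the adapter's getChapterText call would return the cleaned article HTML for each chapter page.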