k5trismegistus/scraper

## scraper
import urllib.request
import bs4
import re


class LexiconScraper:
    '''Fetch a book page and extract its title, release date and the
    circle / author / parody names linked from it.

    Every ``get_*`` method prints its result (as the original script did)
    and also returns it so callers can use the value programmatically.
    '''

    def __init__(self, url):
        '''Download ``url``, decode it as UTF-8 and parse it into ``self.soup``.'''
        # The context manager closes the HTTP response even if read/decode fails.
        with urllib.request.urlopen(url) as response:
            html = response.read().decode('utf-8')
        # Name the parser explicitly so parsing does not depend on which
        # optional parsers happen to be installed (and bs4 does not warn).
        self.soup = bs4.BeautifulSoup(html, 'html.parser')

    def _linked_names(self, kind):
        '''Return the link texts for hrefs like ``/browse/<kind>/<digits>/``.

        Only every second match is kept, as in the original code --
        presumably each entry is linked twice on the page (TODO confirm).
        '''
        pattern = re.compile('/browse/%s/[0-9]+/' % kind)
        names = [link.string for link in self.soup.find_all(href=pattern)]
        return names[0::2]

    def _labelled_value(self, label):
        '''Return the ``.string`` of the node right after the ``td`` whose
        text equals ``label``, or ``None`` when the cell or its sibling is
        missing.
        '''
        cell = self.soup.find('td', string=label)
        sibling = cell.next_sibling if cell is not None else None
        return sibling.string if sibling is not None else None

    def get_others(self):
        '''Print and return ``(circle, author, parody)`` as three lists.'''
        circle = self._linked_names('circle')
        author = self._linked_names('author')
        parody = self._linked_names('parody')

        print('circle: ', circle, sep='')
        print('author: ', author, sep='')
        print('parody: ', parody, sep='')
        return circle, author, parody

    def get_title(self):
        '''Print and return the value next to the '原題:' (original title)
        label, or ``None`` when the page has no such label.
        '''
        title = self._labelled_value('原題:')

        print('title:', title, sep='')
        return title

    def get_date(self):
        '''Print and return the release date as ['year', 'month', 'day'].

        The value next to the '発行日:' (publication date) label is split
        on '-'. An empty list is returned when the label is missing.
        '''
        raw_date = self._labelled_value('発行日:')
        date = raw_date.split('-') if raw_date is not None else []

        print('date: ', date, sep='')
        return date

if __name__ == '__main__':
    # Demo run against one book page; each step prints its own result.
    scraper = LexiconScraper('http://www.doujinshi.org/book/150813/')
    for step in (scraper.get_title, scraper.get_others, scraper.get_date):
        step()
	import urllib.request
	import bs4
	import re


	class LexiconScraper:
	    '''Fetch a book page and extract its title, release date and the
	    circle / author / parody names linked from it.

	    Every ``get_*`` method prints its result (as the original script did)
	    and also returns it so callers can use the value programmatically.
	    '''

	    def __init__(self, url):
	        '''Download ``url``, decode it as UTF-8 and parse it into ``self.soup``.'''
	        # The context manager closes the HTTP response even if read/decode fails.
	        with urllib.request.urlopen(url) as response:
	            html = response.read().decode('utf-8')
	        # Name the parser explicitly so parsing does not depend on which
	        # optional parsers happen to be installed (and bs4 does not warn).
	        self.soup = bs4.BeautifulSoup(html, 'html.parser')

	    def _linked_names(self, kind):
	        '''Return the link texts for hrefs like ``/browse/<kind>/<digits>/``.

	        Only every second match is kept, as in the original code --
	        presumably each entry is linked twice on the page (TODO confirm).
	        '''
	        pattern = re.compile('/browse/%s/[0-9]+/' % kind)
	        names = [link.string for link in self.soup.find_all(href=pattern)]
	        return names[0::2]

	    def _labelled_value(self, label):
	        '''Return the ``.string`` of the node right after the ``td`` whose
	        text equals ``label``, or ``None`` when the cell or its sibling is
	        missing.
	        '''
	        cell = self.soup.find('td', string=label)
	        sibling = cell.next_sibling if cell is not None else None
	        return sibling.string if sibling is not None else None

	    def get_others(self):
	        '''Print and return ``(circle, author, parody)`` as three lists.'''
	        circle = self._linked_names('circle')
	        author = self._linked_names('author')
	        parody = self._linked_names('parody')

	        print('circle: ', circle, sep='')
	        print('author: ', author, sep='')
	        print('parody: ', parody, sep='')
	        return circle, author, parody

	    def get_title(self):
	        '''Print and return the value next to the '原題:' (original title)
	        label, or ``None`` when the page has no such label.
	        '''
	        title = self._labelled_value('原題:')

	        print('title:', title, sep='')
	        return title

	    def get_date(self):
	        '''Print and return the release date as ['year', 'month', 'day'].

	        The value next to the '発行日:' (publication date) label is split
	        on '-'. An empty list is returned when the label is missing.
	        '''
	        raw_date = self._labelled_value('発行日:')
	        date = raw_date.split('-') if raw_date is not None else []

	        print('date: ', date, sep='')
	        return date

	if __name__ == '__main__':
	    # Demo run against one book page; each step prints its own result.
	    scraper = LexiconScraper('http://www.doujinshi.org/book/150813/')
	    for step in (scraper.get_title, scraper.get_others, scraper.get_date):
	        step()