Skip to content

Instantly share code, notes, and snippets.

@k5trismegistus
Last active August 29, 2015 14:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save k5trismegistus/d99cc76c45b3f7b6c5fe to your computer and use it in GitHub Desktop.
Save k5trismegistus/d99cc76c45b3f7b6c5fe to your computer and use it in GitHub Desktop.
class LexiconScraper
import urllib.request
import bs4
import re
class LexiconScraper():
    """Scrape book metadata from a doujinshi.org book page.

    The page is downloaded and parsed once in the constructor. Each
    ``get_*`` method then reads from the parsed tree in ``self.soup``,
    prints its result in the same format as before and also returns it
    so callers can use the value programmatically.
    """

    def __init__(self, url):
        """Download *url*, decode it as UTF-8 and parse it into ``self.soup``."""
        # Use a context manager so the HTTP response is always closed.
        with urllib.request.urlopen(url) as response:
            html = response.read().decode('utf-8')
        # Name the parser explicitly: without it Beautiful Soup picks
        # whichever parser happens to be installed (and warns about it),
        # so the resulting tree could differ between machines.
        self.soup = bs4.BeautifulSoup(html, 'html.parser')

    def _link_texts(self, kind):
        """Return ``.string`` of every tag whose href matches /browse/<kind>/<digits>/."""
        pattern = re.compile('/browse/%s/[0-9]+/' % kind)
        return [tag.string for tag in self.soup.find_all(href=pattern)]

    def _value_after(self, label):
        """Return the string of the node right after the ``td`` whose text is *label*.

        Returns ``None`` when the label cell or its next sibling is
        missing (the sibling's ``.string`` may itself be ``None``).
        """
        label_cell = self.soup.find('td', text=label)
        if label_cell is None:
            return None
        value_cell = label_cell.next_sibling
        if value_cell is None:
            return None
        return value_cell.string

    def get_others(self):
        """Print and return the circle, author and parody names.

        Returns:
            dict mapping ``'circle'``, ``'author'`` and ``'parody'`` to the
            list of link texts found for that category.
        """
        others = {}
        for kind in ('circle', 'author', 'parody'):
            # Only every second match is kept, exactly as before.
            # NOTE(review): presumably each entry is linked twice on the
            # page -- confirm against the live markup.
            names = self._link_texts(kind)[0::2]
            others[kind] = names
            print('%s: ' % kind, end='')
            print(names)
        return others

    def get_title(self):
        """Print and return the original title, or ``None`` if it is not on the page."""
        # '原題:' is the page's label for "original title".
        title = self._value_after('原題:')
        print('title:', end='')
        print(title)
        return title

    def get_date(self):
        """Print and return the release date split on ``'-'``.

        Returns:
            list of strings, expected to be ``['year', 'month', 'day']``;
            an empty list when the date is not on the page.
        """
        # '発行日:' is the page's label for "release date".
        raw_date = self._value_after('発行日:')
        date = raw_date.split('-') if raw_date is not None else []
        print('date: ', end='')
        print(date)
        return date
if __name__ == '__main__':
    import sys

    # Scrape the URL given as the first command-line argument; with no
    # argument, fall back to the sample book page used originally.
    page_url = sys.argv[1] if len(sys.argv) > 1 else 'http://www.doujinshi.org/book/150813/'
    scraper = LexiconScraper(page_url)
    scraper.get_title()
    scraper.get_others()
    scraper.get_date()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment