Skip to content

Instantly share code, notes, and snippets.

@siketyan
Last active November 26, 2021 15:47
Show Gist options
  • Save siketyan/a43cbf286d4c70b2abf2dc40bc04900a to your computer and use it in GitHub Desktop.
Save siketyan/a43cbf286d4c70b2abf2dc40bc04900a to your computer and use it in GitHub Desktop.
from json import dumps
from re import compile, Pattern
from signal import signal, SIGPIPE, SIG_DFL
from sys import stderr
from typing import Any, Optional
from urllib import request, parse as urlparse

from bs4 import BeautifulSoup
# Put SIGPIPE back on its default disposition (SIG_DFL).
# NOTE(review): presumably so that piping the JSON output into a consumer that
# exits early (e.g. `| head`) ends the script quietly instead of raising
# BrokenPipeError — confirm this is the intent.
signal(SIGPIPE, SIG_DFL)
# Root URL; page paths are joined onto this before fetching.
BASE_URL = 'https://libe-tokyo.com/'

# The separator/bracket classes below accept both the ASCII character and its
# full-width counterpart. The full-width forms are written as \uXXXX escapes
# (U+FF1A ':', U+FF08 '(', U+FF09 ')') so they survive Unicode normalisation of
# this file; previously each class just listed the ASCII character twice.

# "<label><colon><text>": captures everything after the colon as `text`.
# The leading `.+` is greedy, so the split happens at the LAST colon that still
# has at least one character after it.
PATTERN_GENERAL = compile(r'.+[:\uFF1A](?P<text>.+)')

# "<label><colon><digits>": captures the run of digits after the colon as `text`.
PATTERN_NUMBER = compile(r'.+[:\uFF1A](?P<text>\d+)')

# "名前<colon>[tag]name(ruby)": `tag` (in square brackets) and `ruby` (in
# parentheses) are optional; `name` is everything up to the first parenthesis.
PATTERN_NAME = compile(r'名前[:\uFF1A](?:\[(?P<tag>.+)\])?(?P<name>[^(\uFF08]+)(?:[(\uFF08](?P<ruby>.+)[)\uFF09])?')
def soup_open(path: str) -> BeautifulSoup:
    """Download the page at *path* and return it parsed as HTML.

    *path* is resolved against ``BASE_URL`` with ``urljoin``. The resolved URL
    is logged to stderr before the request is made, and the response is parsed
    with the ``html.parser`` backend while the connection is still open.
    """
    target = urlparse.urljoin(BASE_URL, path)
    print(f'Opening {target}', file=stderr)
    with request.urlopen(target) as page:
        document = BeautifulSoup(page, 'html.parser')
        return document
def soup_find(src: BeautifulSoup, pattern: Pattern, prefix: Optional[str] = None, group: Optional[str] = 'text') -> Any:
    """Extract data from the first ``<dd>`` under *src* that matches *pattern*.

    Args:
        src: Node whose ``find_all('dd')`` results are scanned in order.
        pattern: Compiled regex applied with ``match`` (anchored at the start
            of the tag text).
        prefix: When given, tags whose text does not start with this string
            are skipped before the regex is tried.
        group: Name of the capture group to return. Pass ``None`` to get the
            whole ``groupdict()`` instead.

    Returns:
        The requested group (a ``str``, or ``None`` if that group did not
        participate in the match), the group dict when *group* is ``None``,
        or ``None`` when no ``<dd>`` matched at all.
    """
    for tag in src.find_all('dd'):
        # Read the text once per tag instead of once per check.
        text = tag.text
        if prefix is not None and not text.startswith(prefix):
            continue
        m = pattern.match(text)
        if m is None:
            continue
        if group is not None:
            return m.group(group)
        return m.groupdict()
    # Nothing matched: make the "not found" result explicit.
    return None
def int_safe(src: Optional[str]) -> Optional[int]:
    """Convert *src* to ``int``, passing ``None`` through unchanged.

    Raises:
        ValueError: If *src* is a string that ``int()`` cannot parse.
    """
    if src is None:
        return None
    return int(src)
def parse_bool(src: BeautifulSoup, prefix: str) -> Optional[bool]:
    """Read the ``<dd>`` entry starting with *prefix* as a yes/no flag.

    Returns:
        ``True`` when the text after the colon is exactly ``○`` or ``◎``,
        ``False`` for any other text, and ``None`` when no matching entry
        exists under *src*.
    """
    value = soup_find(src, PATTERN_GENERAL, prefix)
    if value is None:
        return None
    return value in ('○', '◎')
def _find_section(soup: BeautifulSoup, title: str):
    """Return the parent of the ``<dt class="profile">`` whose text is *title*."""
    # `string=` is the current spelling of the keyword bs4 used to call `text=`.
    heading = soup.find('dt', attrs={'class': 'profile'}, string=title)
    if heading is None:
        # Fail with a message naming the section rather than an opaque
        # AttributeError on `None.parent`.
        raise ValueError('section %r not found on page' % title)
    return heading.parent


def parse_girl(path: str) -> dict:
    """Fetch the profile page at *path* and return its fields as a dict.

    The result has the keys ``name`` (group dict of ``PATTERN_NAME``: ``tag``,
    ``name``, ``ruby``), ``age``, ``sizes`` (``t``/``b``/``p`` as ints),
    ``types`` (the ``タイプ`` text split on ``/``) and ``options`` (flags read
    with ``parse_bool``). Any field whose entry is absent from the page is
    ``None``.

    Raises:
        ValueError: If the profile or options section is missing from the
            page, or a numeric field cannot be converted.
    """
    soup = soup_open(path)
    profile_section = _find_section(soup, 'プロフィール')
    options_section = _find_section(soup, '可能プレイ&オプション')

    # The type entry may be absent; keep it None like every other field
    # instead of crashing on `None.split`.
    types = soup_find(profile_section, PATTERN_GENERAL, 'タイプ')

    girl = {
        'name': soup_find(profile_section, PATTERN_NAME, group=None),
        'age': int_safe(soup_find(profile_section, PATTERN_NUMBER, '年齢')),
        'sizes': {
            't': int_safe(soup_find(profile_section, PATTERN_NUMBER, 'T')),
            'b': int_safe(soup_find(profile_section, PATTERN_NUMBER, 'B')),
            'p': int_safe(soup_find(profile_section, PATTERN_NUMBER, 'P')),
        },
        'types': types.split('/') if types is not None else None,
        'options': {
            'af': parse_bool(options_section, 'アナル受け'),
            'reversed_af': parse_bool(options_section, '逆アナル'),
            '3p': parse_bool(options_section, '3P'),
            'reversed_3p': parse_bool(options_section, '逆3P'),
            'soft_s': parse_bool(options_section, 'ソフトS'),
            'soft_m': parse_bool(options_section, 'ソフトM'),
        },
    }
    return girl
# Script driver: load the listing page, follow every link found inside the
# "girls-left" / "girls-right" columns, and dump the parsed profiles to stdout
# as a single JSON array.
soup = soup_open('/index2.html')
sections = soup.find_all('div', attrs={"class": ["girls-left", "girls-right"]})
girls = [
    parse_girl(anchor['href'])
    for section in sections
    for anchor in section.find_all('a')
]
print(dumps(girls))
curl "https://gist.githubusercontent.com/siketyan/a43cbf286d4c70b2abf2dc40bc04900a/raw/e1b4440a35a41250d59dd55d5a92d5a804ff9f62/libe.py" | python3 | jq "map(select(.options.reversed_af)) | sort_by(.sizes.p) | reverse | map(.name.name)"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment