Skip to content

Instantly share code, notes, and snippets.

@allieus
Created August 19, 2016 04:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save allieus/e870ce9a5d136884e0e55a27e63b5014 to your computer and use it in GitHub Desktop.
Save allieus/e870ce9a5d136884e0e55a27e63b5014 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import requests
import re
from bs4 import BeautifulSoup
def fetch_page(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.RequestException: If the connection fails, the request
            times out, or the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of handing an error page to the
    # caller as if it were the requested document.
    response.raise_for_status()
    return response.text
def get_tags_from_page(url, max_containers=42):
    """Print one tag line per track container found on the page at *url*.

    The page is fetched with ``fetch_page`` and every ``<div>`` whose class
    matches ``(item|composer|work|track)+-container$`` is visited in
    document order. Item, composer and work containers update the current
    album, composer and work title; each track container prints a line of
    the form::

        album __!__ composer __!__ title_head - title_body

    Args:
        url: Address of the catalogue page to scrape.
        max_containers: Maximum number of matching containers to process
            (the default of 42 keeps the original hard-coded cut-off).
            Pass ``None`` to process every container.

    Returns:
        None. The result is written to standard output.
    """
    html = fetch_page(url)
    # NOTE: the "lxml" parser needs the third-party lxml package installed.
    soup = BeautifulSoup(html, "lxml")
    containers = soup.find_all(
        'div',
        {'class': re.compile('(item|composer|work|track)+-container$')},
    )

    # Raw strings keep the patterns identical while avoiding the
    # invalid-escape-sequence warnings of plain string literals.
    # Parenthesised four-digit year range, e.g. " (1770 - 1827)"
    # (presumably the composer's lifespan -- confirm against the page).
    year_range_re = re.compile(r" \(\d{4} - \d{4}\)")
    # Digit/colon run stripped from the track text (presumably the track
    # number and duration -- confirm against the page markup).
    track_noise_re = re.compile(r"\d{1}?\:\d{3,4}\:\d{2}")

    # Start from empty strings so a track that appears before its
    # album/composer/work prints blanks rather than "{}".
    album = composer = title_head = title_body = ''

    # Slicing replaces the former ``contents.index(t) > 40`` check, which
    # was O(n) per iteration and could misfire on equal-looking tags.
    for container in containers[:max_containers]:
        first_class = container.attrs['class'][0]
        text = container.get_text().replace('\n', '')

        if 'item-container' in first_class:
            album = text
        elif 'composer-container' in first_class:
            composer = year_range_re.sub('', text)
        elif 'work-container' in first_class:
            title_head = text
        elif 'track-container' in first_class:
            title_body = track_noise_re.sub('', text)
            print(
                album, '__!__', composer, '__!__', title_head, '-', title_body
            )  # for tag
    return
# Catalogue page whose track listing is scraped when run as a script.
url = "http://www.deutschegrammophon.com/kr/cat/4796220"

if __name__ == "__main__":
    # Guarded so that importing this module does not trigger a network
    # request and print output as a side effect.
    get_tags_from_page(url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment