Created
March 10, 2017 09:53
-
-
Save KojiAomatsu/a374a6caca0254beaa2f977811be4acf to your computer and use it in GitHub Desktop.
search popular lyrics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
from collections import Counter | |
import urllib.request as req | |
import re | |
import time | |
# Scraper settings. The values are the ones the script has always used; they
# are named here so they can be changed in one place.
BASE_URL = "http://j-lyric.net"
RANKING_URL = BASE_URL + "/lyric/p{0}.html"
RANKING_PAGES = range(1, 101)
MIN_PHRASE_LENGTH = 7   # shorter ASCII fragments are treated as noise
TOP_N = 50              # how many of the most common phrases to print
REQUEST_DELAY = 0.2     # seconds to pause after each lyric page
REQUEST_TIMEOUT = 30    # seconds before a stalled request is abandoned

# Non-greedy so that each tag is matched separately.
TAG_PATTERN = re.compile(r"<.*?>")
# Removed from the whole markup before splitting: "." in TAG_PATTERN does not
# match a newline, so a tag broken across lines would otherwise survive.
LINE_BREAK_CHARS = str.maketrans("", "", "\n\r\u3000")
# Punctuation removed from each phrase before it is counted.
PUNCTUATION_CHARS = str.maketrans("", "", "():!")


def extract_english_lines(markup, min_length=MIN_PHRASE_LENGTH):
    """Return the distinct ASCII phrases found in a lyric's HTML markup.

    The markup is split on HTML tags. Each piece is reduced to its ASCII
    characters, stripped of the characters ``( ) : !`` and of surrounding
    whitespace, and lower-cased. Pieces shorter than *min_length* characters
    are discarded.

    Each phrase is returned once, in order of first appearance, so that the
    result (and any tie-breaking done on it later) is the same on every run.
    """
    flattened = markup.translate(LINE_BREAK_CHARS)
    phrases = []
    seen = set()
    for piece in TAG_PATTERN.split(flattened):
        ascii_only = "".join(ch for ch in piece if ord(ch) < 128)
        phrase = ascii_only.translate(PUNCTUATION_CHARS).strip().lower()
        if len(phrase) >= min_length and phrase not in seen:
            seen.add(phrase)
            phrases.append(phrase)
    return phrases


def fetch_soup(url):
    """Download *url* and return it parsed by BeautifulSoup's ``html.parser``.

    The HTTP response is closed as soon as parsing is finished. Network
    failures raised by ``urlopen`` propagate to the caller.
    """
    with req.urlopen(url, timeout=REQUEST_TIMEOUT) as response:
        return BeautifulSoup(response, "html.parser")


def collect_song_links(pages=RANKING_PAGES):
    """Return the absolute URL of every song linked from the ranking pages.

    A page that cannot be downloaded, or that has no ``rankingBlock`` div, is
    reported and skipped instead of aborting the whole run.
    """
    links = []
    for page in pages:
        url = RANKING_URL.format(page)
        try:
            soup = fetch_soup(url)
        except OSError as err:  # URLError, HTTPError and timeouts
            print("{0} failed: {1}".format(url, err))
            continue
        ranking = soup.find("div", id="rankingBlock")
        if ranking is None:
            print("{0} has no ranking block.".format(url))
            continue
        # href=True skips anchors without a target instead of raising KeyError.
        for ref in ranking.find_all("a", class_="title", href=True):
            links.append(BASE_URL + ref.attrs["href"])
        print("{0} is done.".format(page))
    return links


def collect_phrases(links):
    """Return the phrases of every song in *links*, one entry per song.

    A phrase repeated within a single song contributes only once for that
    song, so the later count reflects in how many songs a phrase occurs.
    """
    phrases = []
    for link in links:
        try:
            soup = fetch_soup(link)
        except OSError as err:  # URLError, HTTPError and timeouts
            print("{0} failed: {1}".format(link, err))
        else:
            body = soup.find("p", id="lyricBody")
            if body is not None:
                phrases.extend(extract_english_lines(str(body)))
            print("{0} is done.".format(link))
        # Pause after every request, successful or not.
        time.sleep(REQUEST_DELAY)
    return phrases


def main():
    """Scrape the ranking pages and print the most common ASCII phrases."""
    counter = Counter(collect_phrases(collect_song_links()))
    for phrase, count in counter.most_common(TOP_N):
        print("{0} ({1})".format(phrase, count))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment