Skip to content

Instantly share code, notes, and snippets.

@KojiAomatsu
Created March 10, 2017 09:53
Show Gist options
  • Save KojiAomatsu/a374a6caca0254beaa2f977811be4acf to your computer and use it in GitHub Desktop.
Save KojiAomatsu/a374a6caca0254beaa2f977811be4acf to your computer and use it in GitHub Desktop.
# Search popular lyrics: scrape the j-lyric.net ranking pages and count the most frequent English (ASCII) lyric lines.
from bs4 import BeautifulSoup
from collections import Counter
import urllib.request as req
import re
import time
# Site root; ranking pages list song links as site-relative hrefs.
BASE_URL = "http://j-lyric.net"
# URL template of one ranking page; {0} is the 1-based page number.
RANKING_URL = BASE_URL + "/lyric/p{0}.html"
# Ranking pages p1.html .. p100.html are crawled.
FIRST_PAGE = 1
LAST_PAGE = 100
# Normalized lines shorter than this many characters are discarded.
MIN_LINE_LENGTH = 7
# Pause (seconds) after each lyric page request.
REQUEST_DELAY = 0.2
# Number of most frequent lines printed at the end.
TOP_N = 50

# Non-greedy match of a single markup tag; used to split lyric HTML into lines.
TAG_PATTERN = re.compile(r"<.*?>")
# Line breaks and ideographic spaces are removed before splitting on tags.
WHITESPACE_TABLE = str.maketrans("", "", "\n\r\u3000")
# Punctuation removed from each line so that variants count as the same line.
PUNCTUATION_TABLE = str.maketrans("", "", "():!")


def fetch_soup(url):
    """Download *url* and return it parsed with BeautifulSoup's html.parser.

    The HTTP response is closed as soon as the document has been parsed.
    """
    with req.urlopen(url) as res:
        return BeautifulSoup(res, "html.parser")


def fetch_song_links(page):
    """Return the absolute song URLs listed on ranking page number *page*.

    Links are the ``<a class="title">`` elements inside
    ``<div id="rankingBlock">``. An empty list is returned when that
    container is missing, instead of failing with ``AttributeError``.
    """
    soup = fetch_soup(RANKING_URL.format(page))
    ranking = soup.find("div", id="rankingBlock")
    if ranking is None:
        return []
    return [
        BASE_URL + anchor["href"]
        for anchor in ranking.find_all("a", class_="title")
        if anchor.get("href")
    ]


def fetch_lyrics_html(url):
    """Return the markup of ``<p id="lyricBody">`` on the song page *url*.

    Returns an empty string when the page has no such element.
    """
    body = fetch_soup(url).find("p", id="lyricBody")
    return "" if body is None else str(body)


def extract_english_lines(lyrics_html):
    """Return the distinct normalized ASCII lines found in *lyrics_html*.

    The markup is split on tags; from each piece only characters with a
    code point below 128 are kept, the characters ``( ) : !`` are removed,
    surrounding whitespace is stripped and the text is lower-cased. Pieces
    shorter than ``MIN_LINE_LENGTH`` characters after normalization are
    dropped. Each line appears once in the result, which is sorted so the
    output does not depend on set iteration order.
    """
    flattened = lyrics_html.translate(WHITESPACE_TABLE)
    unique_lines = set()
    for piece in TAG_PATTERN.split(flattened):
        ascii_only = "".join(c for c in piece if ord(c) < 128)
        normalized = ascii_only.translate(PUNCTUATION_TABLE).strip().lower()
        if len(normalized) >= MIN_LINE_LENGTH:
            unique_lines.add(normalized)
    return sorted(unique_lines)


def main():
    """Crawl the ranking pages, then print the most common English lines.

    A line is counted at most once per song, so the printed number is the
    number of songs in which that line occurs.
    """
    links = []
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        links.extend(fetch_song_links(page))
        print("{0} is done.".format(page))

    counter = Counter()
    for link in links:
        counter.update(extract_english_lines(fetch_lyrics_html(link)))
        print("{0} is done.".format(link))
        # Pause between song pages to avoid hammering the server.
        time.sleep(REQUEST_DELAY)

    for line, count in counter.most_common(TOP_N):
        print("{0} ({1})".format(line, count))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment