Last active
December 12, 2020 17:13
-
-
Save paulpopus/40f3d21cf7676f56a891d027b38a4d97 to your computer and use it in GitHub Desktop.
Searching for a keyword in a list of songs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" /requirements.txt
beautifulsoup4==4.9.3
lxml==4.6.2
soupsieve==2.1
"""
from bs4 import BeautifulSoup | |
from urllib.request import Request, urlopen | |
import lxml | |
import re | |
import sys | |
# Primary page we will scrape for all the URLs we will check
targetUrl = 'http://www.songlyrics.com/j-cole-lyrics/'

# What string we will check the URL contains,
# for this website each song name in the URL is appended to the artist name.
# It is compiled as a regular expression when the links are collected below.
urlContains = 'j-cole'

# The word that we will be looking for
keyword = 'kanye'

# We will store results in this and then print them to console
# NOTE(review): nothing in the visible part of the file writes to this dict.
results = {}

# The request is sent with a browser-like User-Agent header.
req = Request(targetUrl, headers={'User-Agent': 'Mozilla/5.0'})

# Read the body inside a context manager so the HTTP response is closed
# as soon as the page has been downloaded instead of being left open.
with urlopen(req) as response:
    mainPage = response.read()

# 'ignore' drops any bytes that are not valid UTF-8 instead of raising.
htmlString = mainPage.decode('utf8', 'ignore')
soup = BeautifulSoup(htmlString, 'lxml')

# Every tag whose href matches urlContains and whose itemprop is "url".
songList = soup.find_all(href=re.compile(urlContains), itemprop='url')
# Returns true if the keyword is found in the lyrics | |
def isKeywordInLyrics(lyrics):
    """Report whether the module-level ``keyword`` occurs in ``lyrics``.

    Args:
        lyrics: The text to search.

    Returns:
        True when ``keyword`` is a substring of ``lyrics`` (the comparison
        is case-sensitive), otherwise False.
    """
    # Return the membership test itself so that a miss yields an explicit
    # False rather than falling off the end of the function with None.
    return keyword in lyrics
# Makes a request to a URL and returns the lyrics as a string | |
def getLyricsFromUrl(url):
    """Download a song page and return the text of its lyrics.

    Args:
        url: Address of the song page to fetch.

    Returns:
        The text content of every ``<p id="songLyricsDiv">`` element found
        on the page, joined with newlines. An empty string is returned when
        the page contains no such element.

    Raises:
        Any exception raised by ``urlopen`` is propagated; nothing is
        caught here.
    """
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

    # Read the body inside a context manager so the HTTP response is closed
    # once the page has been downloaded.
    with urlopen(req) as response:
        lyricsPage = response.read()

    # 'ignore' drops any bytes that are not valid UTF-8 instead of raising.
    lyricsString = lyricsPage.decode('utf8', 'ignore')
    lyricsSoup = BeautifulSoup(lyricsString, 'lxml')
    lyricsBlocks = lyricsSoup.find_all('p', id='songLyricsDiv')

    # Return only the text nodes. Stringifying the result set directly would
    # include the list brackets and the HTML markup (tag and attribute names),
    # which a keyword search could then match by accident. The separator keeps
    # text from adjacent nodes from being glued into one word.
    return '\n'.join(block.get_text(separator='\n') for block in lyricsBlocks)
# Walk every collected song link, download its lyrics and report the songs
# whose lyrics contain the keyword. Nothing is done when no links were found.
if songList:
    totalNumber = len(songList)

    # Initial progress line, shown before the first request is made.
    sys.stdout.write('Checking {index} of {total} songs'.format(index=0, total=totalNumber))
    sys.stdout.flush()

    for index, song in enumerate(songList, start=1):
        # The carriage returns move the cursor back to the start of the line
        # so each update overwrites the previous one. Flush on every
        # iteration; otherwise the counter can sit in the output buffer and
        # not be displayed while the next page is being downloaded.
        sys.stdout.write('\rChecking {index} of {total} songs\r'.format(index=index, total=totalNumber))
        sys.stdout.flush()

        # Look the link target up directly instead of scanning all attributes.
        value = song.attrs.get('href')
        if value is None:
            continue

        lyrics = getLyricsFromUrl(value)
        if isKeywordInLyrics(lyrics):
            print('Found in {song}'.format(song=value))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment