Skip to content

Instantly share code, notes, and snippets.

@paulpopus
Last active December 12, 2020 17:13
Show Gist options
  • Save paulpopus/40f3d21cf7676f56a891d027b38a4d97 to your computer and use it in GitHub Desktop.
Save paulpopus/40f3d21cf7676f56a891d027b38a4d97 to your computer and use it in GitHub Desktop.
Searching for a keyword in a list of songs
""" /requirements.txt
beautifulsoup4==4.9.3
lxml==4.6.2
soupsieve==2.1
"""
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import lxml
import re
import sys
# Primary page we will scrape for all the URLs we will check
targetUrl = 'http://www.songlyrics.com/j-cole-lyrics/'
# Literal text every song URL must contain;
# for this website each song name in the URL is appended to the artist name
urlContains = 'j-cole'
# The word that we will be looking for
keyword = 'kanye'
# Intended to hold results for printing to the console
# (NOTE(review): nothing in the visible script writes to it yet)
results = {}

# The request is sent with a browser-like User-Agent header
req = Request(targetUrl, headers={'User-Agent': 'Mozilla/5.0'})
# Use the response as a context manager so the connection is closed
# as soon as the body has been read
with urlopen(req) as response:
    mainPage = response.read()
# Bytes that are not valid UTF-8 are dropped instead of raising
htmlString = mainPage.decode('utf8', 'ignore')
soup = BeautifulSoup(htmlString, 'lxml')
# urlContains is plain text, not a pattern, so escape it before it is
# compiled into the regex used to filter the href attributes
songList = soup.find_all(href=re.compile(re.escape(urlContains)), itemprop='url')
def isKeywordInLyrics(lyrics, word=None):
    """Return True if the search word occurs in ``lyrics``, otherwise False.

    The comparison ignores letter case, so a lowercase search word such as
    'kanye' also matches 'Kanye' in the lyrics text.

    Args:
        lyrics: The text to search in.
        word: The word to look for. When omitted, the module-level
            ``keyword`` is used, which keeps existing one-argument calls
            working unchanged.

    Returns:
        A real boolean in every case (the function never returns None).
    """
    if word is None:
        word = keyword
    # casefold() on both sides makes the substring test case-insensitive
    return word.casefold() in lyrics.casefold()
def getLyricsFromUrl(url):
    """Request a song page and return its lyrics elements as one string.

    Args:
        url: Address of the song page to download.

    Returns:
        str() of the collection of ``<p id="songLyricsDiv">`` elements found
        on the page. Because the elements themselves are stringified, the
        result contains their HTML markup as well as the lyric text.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` when the request
            fails; it is not caught here.
    """
    # The request is sent with a browser-like User-Agent header
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Context manager closes the HTTP response once the body is read,
    # which matters because this function is called once per song
    with urlopen(req) as response:
        lyricsPage = response.read()
    # Bytes that are not valid UTF-8 are dropped instead of raising
    lyricsString = lyricsPage.decode('utf8', 'ignore')
    lyricsSoup = BeautifulSoup(lyricsString, 'lxml')
    lyrics = lyricsSoup.find_all('p', id='songLyricsDiv')
    return str(lyrics)
# Walk every song link found on the main page, download its lyrics and
# report each song whose lyrics contain the keyword.
if songList:
    totalNumber = len(songList)
    index = 0
    sys.stdout.write('Checking {index} of {total} songs'.format(index=index, total=totalNumber))
    sys.stdout.flush()
    for index, song in enumerate(songList, start=1):
        # '\r' moves the cursor back to the start of the line so the
        # progress counter is rewritten in place
        sys.stdout.write('\rChecking {index} of {total} songs\r'.format(index=index, total=totalNumber))
        # Without a flush the progress text has no newline to push it out,
        # so the counter would not update on screen while songs are checked
        sys.stdout.flush()
        value = song.get('href')
        if value is None:
            # No link on this element, so there is nothing to download
            continue
        lyrics = getLyricsFromUrl(value)
        if isKeywordInLyrics(lyrics):
            print('Found in {song}'.format(song=value))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment