Last active
July 27, 2019 20:49
-
-
Save saburbutt/8955aa6bc4cf066e48b3b5a5b0b151c3 to your computer and use it in GitHub Desktop.
LYRICS GENERATION USING HIDDEN MARKOV MODELS WITH UPDATED CODE in PYTHON
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random
import re  # re helps with the text formatting
import string
from html.parser import HTMLParser
from typing import Any, Dict
from urllib.parse import quote

import nltk
import pandas as pd
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from requests import get
# Step 1: take a keyword (or a set of keywords) that will be used in the
# generation of lyrics.
lyrics_words_by_user = input("Enter One keyword for lyrics:")
word_tokenized = word_tokenize(lyrics_words_by_user)

# Step 2: scrape the lyrics related to the keywords.
# The tokens are joined into one search phrase and percent-encoded. Passing
# the token list straight to str() would embed its repr ("['love']") in the
# URL instead of the keyword itself.
url = 'https://www.lyrics.com/lyrics/' + quote(' '.join(word_tokenized), safe='')
print(url)
response = get(url)
html_soup = BeautifulSoup(response.text, 'html.parser')

# Every element carrying the 'lyric-body' class is treated as lyric text.
song_lyrics = html_soup.find_all('', class_='lyric-body')

# Built once, outside the loops: the markup stripper and the table that
# deletes commas and parentheses.
cleaner = re.compile('<.*?>')
drop_punctuation = str.maketrans('', '', ',()')

# Step 3: append the cleaned fragments to the corpus file that the Markov
# model is trained on later. Opening in append mode up front also guarantees
# the file exists even when the search returned nothing.
with open('lyrics1.txt', 'a') as appendFile:
    for lyric_body in song_lyrics:
        for fragment in lyric_body:
            # Strings are immutable, so the cleaned result has to be kept;
            # calling .replace() without using its return value does nothing.
            fragment = cleaner.sub('', str(fragment)).translate(drop_punctuation)
            appendFile.write('\n')
            appendFile.write(fragment)
def testMarkov(startword):
    """Build a transition table from ``lyrics1.txt`` and generate lyrics.

    A fresh, empty table is filled by ``addToLib`` on every call and then
    handed to ``makeLyrics`` together with ``startword``.

    Args:
        startword: First word of the generated lyrics.

    Returns:
        Whatever ``makeLyrics`` returns for ``startword`` and the new table.
    """
    # addToLib fills the dict it is given and returns that same dict.
    transitions = addToLib('lyrics1.txt', {})
    return makeLyrics(startword, transitions)
def addToLib(filename, currLib):
    """Add the word transitions found in a lyrics file to ``currLib``.

    The file is lower-cased and split on single spaces. Each newline is
    emitted as its own ``'\\n'`` token so that the last word of one line and
    the first word of the next are not fused into a single "word". Empty
    tokens caused by runs of spaces are discarded.

    For every adjacent token pair the count ``currLib[current][next]`` is
    incremented; afterwards the counts of *every* entry in ``currLib`` are
    divided by that entry's total, turning them into probabilities. Because
    the normalisation covers the whole table, pass a table that holds raw
    counts (typically an empty dict).

    Args:
        filename: Path of the text file to read.
        currLib: Dict of ``{word: {next_word: count}}``; modified in place.

    Returns:
        The same ``currLib`` object, now holding transition probabilities.
    """
    # Context manager so the file handle is always released.
    with open(filename, 'r') as f:
        text = f.read().lower()

    # Pad newlines with spaces so split(' ') isolates them as tokens.
    words = [w for w in re.sub("\n", " \n ", text).split(' ') if w]

    # Count each currWord -> nextWord sequence.
    for currWord, nextWord in zip(words, words[1:]):
        followers = currLib.setdefault(currWord, {})
        followers[nextWord] = followers.get(nextWord, 0) + 1

    # Change counts to probability values, one source word at a time.
    for followers in currLib.values():
        keyTotal = sum(followers.values())
        for probKey in followers:
            followers[probKey] = followers[probKey] / keyTotal

    print('\n', currLib)
    return currLib
def makeLyrics(startword, probDict, length=50):
    """Generate lyrics by repeatedly stepping through the transition table.

    Starting from ``startword``, each following word is obtained by calling
    ``markov_next`` with the current word and ``probDict``.

    Args:
        startword: First word of the output; emitted unchanged.
        probDict: Transition table passed through to ``markov_next``.
        length: Number of words to emit. Defaults to 50, the value that was
            previously hard-coded. Zero or a negative value yields ``''``.

    Returns:
        A string of ``length`` words, each followed by a single space.
    """
    generated = []
    curr = startword
    for _ in range(length):
        generated.append(curr)
        curr = markov_next(curr, probDict)
    # Each word, the last one included, is followed by one space.
    return ''.join(word + ' ' for word in generated)
# Markov-chain transition step: the states are the observed words themselves,
# so there is no hidden layer involved in this function.
def markov_next(currword, probDict):
    """Pick the word that follows ``currword``.

    If ``currword`` has no entry in ``probDict`` a key of ``probDict`` is
    chosen uniformly at random. Otherwise a number is drawn from [0, 1] and
    the followers of ``currword`` are walked, accumulating their
    probabilities, until the running total reaches the drawn number.

    Args:
        currword: The current word.
        probDict: Dict of ``{word: {next_word: probability}}``.

    Returns:
        The chosen next word.

    Raises:
        IndexError: If a random key is needed but ``probDict`` is empty.
    """
    if currword not in probDict:
        return random.choice(list(probDict))

    wordprobs = probDict[currword]
    randProb = random.uniform(0.0, 1.0)
    currProb = 0.0
    for key, prob in wordprobs.items():
        currProb += prob
        if randProb <= currProb:
            return key

    # Reached when the follower probabilities add up to less than the drawn
    # number (float rounding, or an empty/under-normalised entry). A dict
    # view is not a sequence, so it must be turned into a list for
    # random.choice to accept it.
    return random.choice(list(probDict))
# Read the collected corpus; the context manager closes the file afterwards.
with open('lyrics1.txt', "r") as corpus:
    inputfile = corpus.read()

# Tokenise the corpus and plot how often each token occurs.
words = nltk.tokenize.word_tokenize(inputfile)
count = set(words)
# Named freq_dist rather than ``dict`` so the builtin type is not shadowed
# for the rest of the script.
freq_dist = nltk.FreqDist(words)
freq_dist.plot()
plt.plot(words)
plt.show()

# Generate lyrics seeded with the keyword the user entered, and print them.
compare = testMarkov(lyrics_words_by_user)
print(compare)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The code below can be divided into three steps. The first step generates a corpus from the Lyrics.com repository: I used the BeautifulSoup Python library to scrape the lyrics relevant to the keyword given by the user. The keyword tells us the
subject of the automatically generated lyrics that the user wants. The purpose of attaching an online repository was to always include details relevant to the keyword in case nothing relevant is already available in the corpus. The second step removes the noise that comes along with the data extracted from Lyrics.com so that it can be processed properly. The third step trains the Hidden Markov Model on the corpus to predict the output. We also tracked the sequence of the current words so that they are not repeated in the next iteration. The results show that the lyrics generated from the given keyword are both meaningful and rhythmic. A lot more could be done in light of the literature and work already available on the subject, but I have limited the work to the scope of the project. The complete code with the corpus can be provided by the author on request.