Created
July 29, 2019 11:32
-
-
Save bvenkatr/f588f1f655a86f190532e9baf43ee960 to your computer and use it in GitHub Desktop.
scraping Mohammed_Rafi songs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Get the URL of the Rafi song list that matches the first letter of your name.
 * Main song list URL: https://en.wikipedia.org/wiki/List_of_songs_recorded_by_Mohammed_Rafi
 * Make a request to fetch the data from that URL and find the link that is relevant for
 * the list of songs.
 *
 * @param startLetter - The letter to get the link for
 * @return URL of the list of songs that matches the first letter of your name
 */
let fetch = require("node-fetch");
var cheerio = require('cheerio');
/**
 * Resolve the Wikipedia URL of the Mohammed Rafi song list that covers a letter.
 *
 * Downloads the main index page, collects every anchor that points at a
 * per-letter (or per-letter-range) song list, and returns the one whose
 * parenthesised suffix covers `startLetter`.
 *
 * @param {string} startLetter - Letter to look up (case-insensitive, A-Z).
 * @returns {Promise<string|null>} Absolute URL of the matching list page, or
 *   `null` when the input is not a single letter or no matching link exists.
 * @throws {Error} When the index page responds with a non-OK HTTP status.
 */
async function getRafiMusicLink(startLetter) {
  const letter = String(startLetter == null ? "" : startLetter).trim().toUpperCase();
  if (!/^[A-Z]$/.test(letter)) {
    // Nothing sensible to look up; avoid a pointless network round trip.
    return null;
  }

  // Several letters share one page. The key lists the letters of a group and
  // the value is the suffix as it appears in the href (en dash percent-encoded).
  const groupedSuffixes = {
    BC: "B%E2%80%93C",
    DEF: "D%E2%80%93F",
    HI: "H%E2%80%93I",
    PQR: "P%E2%80%93R",
    UVWXYZ: "U%E2%80%93Z",
  };
  const groupKey = Object.keys(groupedSuffixes).find((letters) => letters.includes(letter));
  const textToMatch = groupKey ? `(${groupedSuffixes[groupKey]})` : `(${letter})`;
  console.log(`The letter to get the songs link is ${textToMatch}`);

  const response = await fetch(`https://en.wikipedia.org/wiki/List_of_songs_recorded_by_Mohammed_Rafi`);
  if (!response.ok) {
    throw new Error(`Failed to fetch song index: HTTP ${response.status}`);
  }
  const html = await response.text();

  const anchorPrefix = '<a href="';
  const linkPattern = /<a href="(\/wiki\/List_of_songs_recorded_by_Mohammed_Rafi_(\([A-Z]\)|\(B%E2%80%93C\)|\(D%E2%80%93F\)|\(H%E2%80%93I\)|\(P%E2%80%93R\)|\(U%E2%80%93Z\)))/gm;
  // String.prototype.match yields null (not an empty array) when nothing matches.
  const linkTags = html.match(linkPattern) || [];

  const matchingTag = linkTags.find((tag) => tag.includes(textToMatch));
  // Every matched tag starts with the anchor prefix, so the remainder is the path.
  return matchingTag ? `https://en.wikipedia.org${matchingTag.slice(anchorPrefix.length)}` : null;
}
/**
 * Fetch a song list page and extract the songs whose title starts with a letter.
 *
 * Each `<li>` under `.mw-parser-output ul` whose first child is a text node is
 * parsed with a regular expression into an object like:
 *   {"name": "Name of the song", "movie": "Movie of the song", "year": "1958"}
 * `movie` and `year` are `undefined` when the entry does not contain them.
 *
 * @param {string} songsURL - URL of the per-letter song list page.
 * @param {string} [startLetter=process.argv[2]] - Letter the song title must
 *   start with; compared case-insensitively against the character that follows
 *   the opening quote of the entry.
 * @returns {Promise<Array<{name: string, movie: (string|undefined), year: (string|undefined)}>>}
 *   Parsed songs in page order.
 * @throws {Error} When `songsURL` is missing or the page responds with a
 *   non-OK HTTP status.
 */
async function getListOfSongs(songsURL, startLetter = process.argv[2]) {
  if (!songsURL) {
    throw new Error("getListOfSongs: songsURL is required");
  }
  console.log(`Getting songs for url ${songsURL}`);

  const response = await fetch(songsURL);
  if (!response.ok) {
    throw new Error(`Failed to fetch songs list: HTTP ${response.status}`);
  }
  const html = await response.text();
  const $ = cheerio.load(html);
  console.log("getting songs....");

  const wantedLetter = typeof startLetter === "string" ? startLetter.toUpperCase() : startLetter;
  // Expected entry shape: "Title (Singers - Composer/Lyricist) - Movie 1958"
  const songPattern = /([\w\d\s,.]*) \([\dA-Za-z,\s-/\.&]*\)\s?\-(\s?([\d\w\s\/?]*)?\s(\d{4})?)?"?/;

  const songsList = [];
  for (const listItem of $('.mw-parser-output ul > li').toArray()) {
    // An <li> may be empty or start with an element (e.g. a link) instead of text.
    const firstChild = listItem.children && listItem.children[0];
    const entryText = firstChild ? firstChild.data : undefined;
    if (!entryText) {
      continue;
    }
    // Index 0 is the opening quote, so the title's first letter sits at index 1.
    if (entryText[1] !== wantedLetter) {
      continue;
    }
    const matches = entryText.match(songPattern);
    if (matches) {
      songsList.push({name: matches[1], movie: matches[3], year: matches[4]});
    }
  }
  return songsList;
}
/**
 * Count how many songs were sung in each year.
 *
 * Songs whose `year` is missing are tallied under the key "undefined",
 * because the year value is used directly as the property name.
 *
 * @param {Array<{year: (string|undefined)}>} songs - Songs to summarise.
 * @returns {Object<string, number>} Map of year -> number of songs.
 */
function analyze(songs) {
  return songs.reduce((countsByYear, song) => {
    const previousCount = countsByYear[song.year] || 0;
    countsByYear[song.year] = previousCount + 1;
    return countsByYear;
  }, {});
}
/**
 * Main function: command-line entry point.
 *
 * Reads the letter from the first command-line argument, resolves the matching
 * song list page, extracts the songs, prints a per-year table and finally the
 * year with the most songs. Prints a warning and returns early when the
 * argument is missing or no list page can be found for it.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const inputValue = process.argv[2];
  if (!inputValue) {
    console.warn(`Please specify the Alphabet you are searching for as command line argument`);
    return;
  }

  const songsURL = await getRafiMusicLink(inputValue);
  if (!songsURL) {
    // Without a URL there is nothing to fetch; fetching a null/undefined URL would throw.
    console.warn(`No songs list link found for "${inputValue}"`);
    return;
  }

  const songs = await getListOfSongs(songsURL);
  const summary = analyze(songs);
  console.table(summary);

  // Start from zero so that a year with a single song can still be the top year.
  let topYear = null;
  let topCount = 0;
  for (const [sungYear, count] of Object.entries(summary)) {
    // Songs without a parsed year are grouped under the "undefined" key; skip them.
    if (sungYear !== "undefined" && count > topCount) {
      topYear = sungYear;
      topCount = count;
    }
  }

  if (topYear === null) {
    console.log(`No songs with a known year were found for "${inputValue}"`);
    return;
  }
  console.log(`Year ${topYear} has highest of ${topCount} songs sung by Rafi`);
}
// Run the CLI; report any failure and exit with a non-zero status instead of
// leaving the promise rejection unhandled.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment