Skip to content

Instantly share code, notes, and snippets.

@bvenkatr
Created July 29, 2019 11:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bvenkatr/f588f1f655a86f190532e9baf43ee960 to your computer and use it in GitHub Desktop.
Save bvenkatr/f588f1f655a86f190532e9baf43ee960 to your computer and use it in GitHub Desktop.
scraping Mohammed_Rafi songs
/**
* Get the URL of the Rafi song list that matches the first letter of your name.
* Main songs list URL: https://en.wikipedia.org/wiki/List_of_songs_recorded_by_Mohammed_Rafi
* Makes a request to the URL above and finds the link that is relevant for the
* list of songs.
*
* @param startLetter - The letter to get the link for
* @return URL of the list of songs that matches the first letter of your name
*/
let fetch = require("node-fetch");
var cheerio = require('cheerio');
/**
 * Find the Wikipedia sub-page that lists Mohammed Rafi's songs for a letter.
 *
 * Downloads the main index page, collects every link to a per-letter (or
 * per-letter-range) song list, and returns the one covering `startLetter`.
 *
 * @param {string} startLetter - Single letter A-Z (case-insensitive).
 * @returns {Promise<string|null>} Absolute URL of the matching list page, or
 *   `null` when the letter is invalid or no matching link is on the page.
 * @throws {Error} When the index page responds with a non-2xx status.
 */
async function getRafiMusicLink(startLetter) {
  const letter = String(startLetter).trim().toUpperCase();
  if (!/^[A-Z]$/.test(letter)) {
    // Only single letters have list pages; skip the network round trip.
    return null;
  }

  // Several letters share one page; the key is the URL-encoded page suffix
  // ("%E2%80%93" is an en dash) and the value is the letters it covers.
  const sharedPages = {
    "B%E2%80%93C": "BC",
    "D%E2%80%93F": "DEF",
    "H%E2%80%93I": "HI",
    "P%E2%80%93R": "PQR",
    "U%E2%80%93Z": "UVWXYZ",
  };
  const sharedSuffix = Object.keys(sharedPages).find((suffix) => sharedPages[suffix].includes(letter));
  const textToMatch = `(${sharedSuffix || letter})`;
  console.log(`The letter to get the songs link is ${textToMatch}`);

  const response = await fetch(`https://en.wikipedia.org/wiki/List_of_songs_recorded_by_Mohammed_Rafi`);
  if (!response.ok) {
    throw new Error(`Failed to load the song index page: HTTP ${response.status}`);
  }
  const indexHtml = await response.text();

  // With the `g` flag, match() yields the full matched strings, or null when
  // nothing matches - fall back to an empty list so the lookup cannot throw.
  const linkTags = indexHtml.match(/<a href="(\/wiki\/List_of_songs_recorded_by_Mohammed_Rafi_(\([A-Z]\)|\(B%E2%80%93C\)|\(D%E2%80%93F\)|\(H%E2%80%93I\)|\(P%E2%80%93R\)|\(U%E2%80%93Z\)))/gm) || [];
  const matchingTag = linkTags.find((tag) => tag.includes(textToMatch));
  if (!matchingTag) {
    return null;
  }

  // Every match starts with the literal `<a href="`; what follows is the path.
  const path = matchingTag.slice('<a href="'.length);
  return `https://en.wikipedia.org${path}`;
}
/**
 * Download a per-letter song list page and extract its songs.
 *
 * Each list item's leading text is expected to look like
 * `"Song Name (Singers - Composer/Lyricist) - Movie 1958"`; items whose text
 * does not fit that shape are ignored. Only songs whose first letter (the
 * character right after the opening quote) equals `startLetter` are kept.
 *
 * @param {string} songsURL - Absolute URL of the song list page.
 * @param {string} [startLetter=process.argv[2]] - Letter to filter by
 *   (case-insensitive). Defaults to the first command-line argument.
 * @returns {Promise<Array<{name: string, movie: (string|undefined), year: (string|undefined)}>>}
 *   The parsed songs; `movie` and `year` are undefined when absent in the text.
 * @throws {Error} When `songsURL` is missing or the page responds with a
 *   non-2xx status.
 */
async function getListOfSongs(songsURL, startLetter = process.argv[2]) {
  if (!songsURL) {
    throw new Error("getListOfSongs requires the URL of a song list page");
  }
  console.log(`Getting songs for url ${songsURL}`);
  const response = await fetch(songsURL);
  if (!response.ok) {
    throw new Error(`Failed to load ${songsURL}: HTTP ${response.status}`);
  }
  const $ = cheerio.load(await response.text());
  console.log("getting songs....");

  // null never equals a character, so no song matches when no letter is given.
  const wantedInitial = typeof startLetter === "string" ? startLetter.toUpperCase() : null;
  // Groups: 1 = song name, 3 = movie, 4 = four-digit year.
  const songPattern = /([\w\d\s,.]*) \([\dA-Za-z,\s-/\.&]*\)\s?\-(\s?([\d\w\s\/?]*)?\s(\d{4})?)?"?/;

  const songsList = [];
  for (const listItem of $('.mw-parser-output ul > li').toArray()) {
    // An empty <li> has no children, and an element child has no `data`;
    // only a leading text node carries the song description.
    const firstChild = listItem.children && listItem.children[0];
    const text = firstChild ? firstChild.data : undefined;
    if (!text) {
      continue;
    }
    // Index 0 is the opening quote, so the song's initial is at index 1.
    if (text.charAt(1).toUpperCase() !== wantedInitial) {
      continue;
    }
    const matches = text.match(songPattern);
    if (matches) {
      songsList.push({name: matches[1], movie: matches[3], year: matches[4]});
    }
  }
  return songsList;
}
/**
 * Count how many songs were sung in each year.
 *
 * @param songs - Array of song objects; only the `year` property is read.
 * @return Object mapping each year to its number of songs. Songs without a
 *         year are counted under the key "undefined".
 */
function analyze(songs) {
  return songs.reduce((songsPerYear, song) => {
    const year = song.year;
    // A missing (or otherwise falsy) entry starts a fresh count at 1.
    songsPerYear[year] = songsPerYear[year] ? songsPerYear[year] + 1 : 1;
    return songsPerYear;
  }, {});
}
/**
 * Command-line entry point.
 *
 * Reads the letter from the first command-line argument, resolves the song
 * list page for it, tabulates the number of songs per year and reports the
 * year with the most songs. Warns and does nothing else when no letter is
 * given or no list page can be found for it.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const inputValue = process.argv[2];
  if (!inputValue) {
    console.warn(`Please specify the Alphabet you are searching for as command line argument`);
    return;
  }

  const songsURL = await getRafiMusicLink(inputValue);
  if (!songsURL) {
    console.warn(`No songs list page was found for "${inputValue}"`);
    return;
  }

  const songs = await getListOfSongs(songsURL, inputValue);
  const summary = analyze(songs);
  console.table(summary);

  // Start from zero so a year with a single song can still win; on a tie the
  // first year encountered is kept.
  let topYear = null;
  let topCount = 0;
  for (const [year, count] of Object.entries(summary)) {
    // Songs whose year could not be parsed are grouped under "undefined".
    if (year !== "undefined" && count > topCount) {
      topYear = year;
      topCount = count;
    }
  }

  if (topYear === null) {
    console.log(`No songs with a known year were found for "${inputValue}"`);
    return;
  }
  console.log(`Year ${topYear} has the highest number of songs sung by Rafi: ${topCount}`);
}
// main() returns a promise; report any failure (network error, unexpected
// page layout) and exit with a failing status instead of leaving the
// rejection unhandled.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment