Skip to content

Instantly share code, notes, and snippets.

@PassiveModding
Last active June 14, 2019 14:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save PassiveModding/78fb5983d9dbfa6ab501d5f18c3edbd8 to your computer and use it in GitHub Desktop.
Save PassiveModding/78fb5983d9dbfa6ab501d5f18c3edbd8 to your computer and use it in GitHub Desktop.
Genius Lyrics Scraper
/*
using System;
using System.Linq;
using System.Net.Http;
using System.Net.Http.Headers;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using HtmlAgilityPack;
using Newtonsoft.Json.Linq;
*/
/// <summary>
/// Searches Genius for a song and scrapes the lyrics from the web page of the first search hit.
/// </summary>
/// <param name="authorization">The Genius API bearer token; one can be found by signing into Genius and viewing the docs examples, e.g. https://docs.genius.com/#songs-h2 </param>
/// <param name="query">
/// Any free-text song query. It is URL-escaped before being sent to the search endpoint.
/// Per the original author, supplying only an artist should yield that artist's most popular
/// song (this depends on Genius' ranking and is not enforced here).
/// </param>
/// <returns>
/// The cleaned-up lyrics of the first result returned by the Genius API for the search, or
/// <c>null</c> when an argument is blank, the search request fails, there are no hits, the
/// lyrics element cannot be found, or any exception occurs along the way.
/// </returns>
/// <remarks>
/// This method is deliberately best-effort: it never throws, it reports every failure as <c>null</c>.
/// Both HTTP calls go through the <c>client</c> member declared outside this method
/// (presumably a single shared <c>HttpClient</c> for the whole application - confirm against the enclosing class).
/// </remarks>
public async Task<string> ScrapeGeniusLyricsAsync(string authorization, string query)
{
    // Nothing sensible can be requested without a token and a query.
    if (string.IsNullOrWhiteSpace(authorization) || string.IsNullOrWhiteSpace(query))
    {
        return null;
    }

    try
    {
        string pathStr;

        // Escape the query so characters such as '&', '#', '+' or spaces cannot corrupt the URL.
        var searchUrl = $"https://api.genius.com/search?q={Uri.EscapeDataString(query)}";
        using (var request = new HttpRequestMessage(HttpMethod.Get, searchUrl))
        {
            request.Headers.Authorization = new AuthenticationHeaderValue("Bearer", authorization);

            //Use the genius api to make a song query.
            //Note that you should only need to use a single httpclient for your entire application
            using (var search = await client.SendAsync(request))
            {
                if (!search.IsSuccessStatusCode)
                {
                    return null;
                }

                var token = JToken.Parse(await search.Content.ReadAsStringAsync());
                var hits = token["response"]?["hits"] as JArray;
                if (hits == null || hits.Count == 0)
                {
                    return null;
                }

                //Access the page path of the first result that was returned
                pathStr = hits[0]["result"]?["path"]?.ToString();
            }
        }

        if (string.IsNullOrEmpty(pathStr))
        {
            return null;
        }

        //Load and scrape the web page content.
        var webHtml = await client.GetStringAsync($"https://genius.com{pathStr}");
        var doc = new HtmlDocument();
        doc.LoadHtml(webHtml);

        //Find the lyrics node if possible.
        //SelectSingleNode yields the first match, or null when nothing matches.
        var lyricsDiv = doc.DocumentNode.SelectSingleNode("//div[contains(@class, 'lyrics')]");
        if (lyricsDiv == null)
        {
            return null;
        }

        return CleanGeniusLyricsText(lyricsDiv.InnerText);
    }
    catch
    {
        // Deliberate best-effort: network, JSON and parsing failures are all reported as "no lyrics".
        return null;
    }
}

/// <summary>
/// Normalizes the raw inner text of a Genius lyrics element into plain lyrics text.
/// </summary>
/// <param name="rawText">The inner text of the lyrics element, still containing HTML entities.</param>
/// <returns>The lyrics with spacing collapsed, entities decoded and the trailing Genius footer removed.</returns>
private static string CleanGeniusLyricsText(string rawText)
{
    //Filter out the spacing between verses
    var text = Regex.Replace(rawText, "\n{2}", "\n");

    //Strip out additional parts which are prefixed with or contain only spaces
    text = Regex.Replace(text, "\n +", "");

    //Put the bracketed section headers (e.g. [Chorus]) on their own line
    text = text.Replace("[", "\n[");

    //Decode every HTML entity (&amp;, &quot;, &#x27;, ...) rather than only &amp;
    text = System.Net.WebUtility.HtmlDecode(text);

    //Strip the additional genius content found at the end of the lyrics
    var indexEnd = text.IndexOf("More on genius", StringComparison.InvariantCultureIgnoreCase);
    if (indexEnd != -1)
    {
        text = text.Substring(0, indexEnd);
    }

    //Remove additional whitespace at the start and end of the response
    return text.Trim();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment