Last active
June 14, 2019 14:01
-
-
Save PassiveModding/78fb5983d9dbfa6ab501d5f18c3edbd8 to your computer and use it in GitHub Desktop.
Genius Lyrics Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
using System; | |
using System.Linq; | |
using System.Net.Http; | |
using System.Net.Http.Headers; | |
using System.Text.RegularExpressions; | |
using System.Threading.Tasks; | |
using HtmlAgilityPack; | |
using Newtonsoft.Json.Linq; | |
*/ | |
/// <summary>
/// Searches the Genius API for <paramref name="query"/> and scrapes the lyrics
/// from the genius.com page of the first search hit.
/// </summary>
/// <param name="authorization">
/// The Genius authorization token, sent as a Bearer token. It can be found by signing
/// into Genius and viewing the docs examples, e.g. https://docs.genius.com/#songs-h2
/// </param>
/// <param name="query">
/// Any song query.
/// Should also return the most popular song of an artist if one is provided.
/// </param>
/// <returns>
/// The cleaned-up lyrics of the first result returned by the Genius search, or
/// <c>null</c> when the query is blank, the search request fails, there are no hits,
/// no lyrics element is found on the page, or any exception occurs along the way.
/// </returns>
public async Task<string> ScrapeGeniusLyricsAsync(string authorization, string query)
{
    // Nothing meaningful to search for; honour the "or null" contract without a round trip.
    if (string.IsNullOrWhiteSpace(query))
    {
        return null;
    }

    try
    {
        // Use the Genius API to make a song query.
        // Note that a single HttpClient should be shared by the entire application.
        string searchJson;
        // The query is escaped so that spaces, '&', '#', '+' etc. cannot corrupt the URL.
        using (var request = new HttpRequestMessage(HttpMethod.Get, $"https://api.genius.com/search?q={Uri.EscapeDataString(query)}"))
        {
            request.Headers.Authorization = new AuthenticationHeaderValue("Bearer", authorization);
            using (var search = await client.SendAsync(request))
            {
                if (!search.IsSuccessStatusCode)
                {
                    return null;
                }

                searchJson = await search.Content.ReadAsStringAsync();
            }
        }

        // Expected shape: { "response": { "hits": [ { "result": { "path": "/..." } }, ... ] } }
        var hits = JToken.Parse(searchJson)["response"]?["hits"] as JArray;
        if (hits == null || !hits.HasValues)
        {
            return null;
        }

        // Access the page path of the first result that was returned.
        var pathStr = hits[0]["result"]?["path"]?.ToString();
        if (string.IsNullOrEmpty(pathStr))
        {
            return null;
        }

        // Load and scrape the web page content, reusing the same client as the API call.
        var webHtml = await client.GetStringAsync($"https://genius.com{pathStr}");
        var doc = new HtmlDocument();
        doc.LoadHtml(webHtml);

        // Find the lyrics node if possible.
        // SelectNodes returns null (not an empty collection) when nothing matches.
        var lyricsDivs = doc.DocumentNode.SelectNodes("//div[contains(@class, 'lyrics')]");
        if (lyricsDivs == null || lyricsDivs.Count == 0)
        {
            return null;
        }

        return CleanLyricsText(lyricsDivs[0].InnerText);
    }
    catch
    {
        // Deliberate best-effort: network, JSON and HTML failures all map to the
        // documented "null" result instead of propagating to the caller.
        return null;
    }
}

/// <summary>
/// Normalises the raw inner text of the lyrics element into plain lyrics.
/// </summary>
/// <param name="text">The raw inner text of the lyrics element.</param>
/// <returns>The lyrics with spacing normalised, HTML entities decoded and the trailing Genius content removed.</returns>
private static string CleanLyricsText(string text)
{
    // Filter out the spacing between verses.
    text = Regex.Replace(text, "\n{2}", "\n");
    // Strip out additional parts which are prefixed with or contain only spaces.
    text = Regex.Replace(text, "\n +", "");
    // Put the bracketed content that starts a verse on its own line.
    text = text.Replace("[", "\n[");
    // The inner text still contains HTML entities (e.g. "&amp;"); decode them.
    text = System.Net.WebUtility.HtmlDecode(text);

    // Strip the additional Genius content found at the end of the lyrics.
    var indexEnd = text.IndexOf("More on genius", StringComparison.InvariantCultureIgnoreCase);
    if (indexEnd != -1)
    {
        text = text.Substring(0, indexEnd);
    }

    // Remove additional whitespace at the start and end of the response.
    return text.Trim();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment