Created
December 27, 2016 03:00
-
-
Save EmmanuelOga/6c1cbb1556fb6881106bb0365a5792c5 to your computer and use it in GitHub Desktop.
Scraping salsa songs with F#!~
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scraping http://salsahackers.com/dj-pick/ | |
open System | |
open System.Text.RegularExpressions | |
open FSharp.Data | |
open FSharp.Data | |
/// Calls `Dump()` on `x` and then returns `x` itself, so the call can be
/// wrapped around any sub-expression.
/// NOTE(review): `Dump` is not defined in this file; presumably it is the
/// LINQPad extension method, given the LINQPad remark further down. Confirm.
let Show x =
    x.Dump() |> ignore
    x

/// Root address of the site being scraped.
let domain = "http://salsahackers.com"

/// Page from which the links to the individual DJ pages are collected.
let startUrl = domain + "/dj-pick/"
/// Return a function able to extract an attribute from an HtmlNode.
/// The returned function yields the value of the attribute named `attrib`,
/// or "" when the node does not carry that attribute.
let attribGetter attrib =
    fun (link : HtmlNode) ->
        // TryGetAttribute reports a missing attribute as None, so the common
        // "not there" case no longer relies on raising and swallowing an
        // exception behind a catch-all handler.
        match link.TryGetAttribute attrib with
        | Some found -> found.Value()
        | None -> ""
/// Links to each dj page, reduced to the part of the address that follows
/// `domain`. The start page is downloaded when this binding is evaluated.
let djUrls =
    // A DJ page link mentions "dj-pick", lives on this site, is not a
    // share-by-email link and is not the start page itself.
    let isDjUrl (url : string) =
        url.Contains("dj-pick")
        && not (url.Contains("share=email"))
        && url.Contains(domain)
        && url <> startUrl
    // Collects the href of every anchor on `page` accepted by `filter`
    // and keeps only what follows `domain`, with "%20" removed.
    let links (page : HtmlDocument) filter =
        page.Descendants ["a"]
        |> Seq.map (attribGetter "href")
        |> Seq.filter filter
        |> Seq.map (fun (url : string) ->
            let parts = url.Split([| domain |], StringSplitOptions.RemoveEmptyEntries)
            // Plain indexing instead of the LINQ `ElementAt` extension:
            // this file never opens System.Linq, so `ElementAt` only resolves
            // in hosts that import that namespace implicitly.
            parts.[parts.Length - 1].Replace("%20", ""))
    links (HtmlDocument.Load(startUrl)) isDjUrl
/// Extract youtube urls from a page.
/// Loads `domain + path` and returns the `src` of every iframe whose
/// address contains "youtube.com".
let videoUrls (path : string) =
    let page = HtmlDocument.Load(domain + path)
    let srcOf = attribGetter "src"
    let pointsToYoutube (src : string) = src.Contains("youtube.com")
    let frames = page.Descendants ["iframe"]
    frames
    |> Seq.map srcOf
    |> Seq.filter pointsToYoutube
/// Every youtube address found on any DJ page, with the leading scheme
/// ("//", "http://" or "https://") removed.
let allVideos =
    // Only a scheme at the very start is stripped. The previous
    // `Replace("//", "")` removed every "//" in the address, which turned
    // "https://host/x" into "https:host/x" and also touched the path/query.
    let schemePrefixes = [ "https://"; "http://"; "//" ]
    let stripScheme (url : string) =
        let startsWith (prefix : string) =
            url.StartsWith(prefix, StringComparison.OrdinalIgnoreCase)
        match List.tryFind startsWith schemePrefixes with
        | Some prefix -> url.Substring(prefix.Length)
        | None -> url
    djUrls
    |> Seq.collect videoUrls
    |> Seq.map stripScheme
## Using the output above, I copy-pasted these URLs...
## Yeah, I know, copy-paste. But I was using LINQPad for exploration :-)
let urls = """ | |
www.youtube.com/embed/FuontJukpL0 | |
www.youtube.com/embed/Ti2SGzb2nkc | |
www.youtube.com/embed/YmC5T9qdplk | |
www.youtube.com/embed/e9z8W21kddw | |
www.youtube.com/embed/zonS5MybYZs | |
www.youtube.com/embed/0rfvKpfCth4 | |
www.youtube.com/embed/qFMN90bHIjA | |
www.youtube.com/embed/2_SORYk6wfc | |
www.youtube.com/embed/aGrNVazCJc8 | |
www.youtube.com/embed/lixi7dX8Uj8 | |
www.youtube.com/embed/NOrfzcA6jzI | |
www.youtube.com/embed/mLwwyCKZZdk | |
www.youtube.com/embed/cpYPPXf63a4 | |
www.youtube.com/embed/AXnywtgrhYc | |
www.youtube.com/embed/yCxD5ni5aOg | |
www.youtube.com/embed/UuCKLBpI_U8 | |
www.youtube.com/embed/DLVDjZV6kV0 | |
www.youtube.com/embed/Dqt8x19rlGU | |
www.youtube.com/embed/uodvLaQ0H5k | |
www.youtube.com/embed/jAQMFxayXHw | |
www.youtube.com/embed/4XibiJUmuzM | |
www.youtube.com/embed/TShBHU70hVE | |
www.youtube.com/embed/VurAF9ioycw | |
www.youtube.com/embed/dkHyQh4qvTo | |
www.youtube.com/embed/Mfqcwb83Rkg | |
www.youtube.com/embed/mVS0DAqsZQM | |
www.youtube.com/embed/4_YBpQNh4-M | |
www.youtube.com/embed/XUx9Sdj8DXM | |
www.youtube.com/embed/EKF0jp_gs3Y | |
www.youtube.com/embed/mI-H9LNS9Qg | |
www.youtube.com/embed/ZGPal-jsris | |
www.youtube.com/embed/X4pRaK_Bx3g | |
www.youtube.com/embed/HUAMlPmXGII | |
www.youtube.com/embed/c4vMlD038To | |
www.youtube.com/embed/etCBERs0LAM | |
www.youtube.com/embed/2veSIbjo0g0 | |
www.youtube.com/embed/EXPsbQHAx_0 | |
www.youtube.com/embed/xFwbjFbvmn8 | |
www.youtube.com/embed/3hLgxMoRpSs | |
www.youtube.com/embed/X6iUcbKGIrg | |
www.youtube.com/embed/VpFiiXeG1hc | |
www.youtube.com/embed/HBegogL6OZk | |
www.youtube.com/embed/VndJ7djo2QU | |
www.youtube.com/embed/oca2doa0k18 | |
www.youtube.com/embed/NdJ4D-7mKeY | |
www.youtube.com/embed/k9iKIM2QhlU | |
www.youtube.com/embed/qdIjJYvj6W0 | |
www.youtube.com/embed/zM5bRA74-34 | |
www.youtube.com/embed/yJMdTeH24S0 | |
www.youtube.com/embed/47t75g1RL4M | |
www.youtube.com/embed/_Vsv2BJ20lg | |
www.youtube.com/embed/7ROfIDs0O6M | |
www.youtube.com/embed/vwCDSZAX3Lo | |
www.youtube.com/embed/qMrgVT_Xk28 | |
www.youtube.com/embed/uSamTXs_338 | |
www.youtube.com/embed/c4vMlD038To | |
www.youtube.com/embed/Qmoy3-E5mF4 | |
www.youtube.com/embed/9Hk5Z7OdEfM | |
www.youtube.com/embed/Y9O_wn2E4tM | |
www.youtube.com/embed/TN9qs_71gf4 | |
www.youtube.com/embed/e6Q2Gxy_KPY | |
www.youtube.com/embed/QWSj1uXqm50 | |
www.youtube.com/embed/mEt26EOSRL4 | |
www.youtube.com/embed/ogfzzxMYCvk | |
www.youtube.com/embed/WArlszvt4u4 | |
www.youtube.com/embed/4WFpjXctJFw | |
www.youtube.com/embed/IAdiujM0C0s | |
www.youtube.com/embed/SETx4HXqzyI | |
www.youtube.com/embed/sIjDrjEJ-LU | |
www.youtube.com/embed/uyvh8giWRbo | |
www.youtube.com/embed/lmOYAyNxHbY | |
www.youtube.com/embed/mSSPCyNrY6Y | |
www.youtube.com/embed/S4c2ynj7FnA | |
www.youtube.com/embed/hfxeO6O4WxI | |
www.youtube.com/embed/BWrugW1rXjU | |
www.youtube.com/embed/bfLnRe76CNk | |
www.youtube.com/embed/Nsm-AGAWIcQ | |
www.youtube.com/embed/nsks0_RubM8 | |
www.youtube.com/embed/bpQzU2qaDqY | |
www.youtube.com/embed/_43zLcr-MRQ | |
www.youtube.com/embed/16gWwH8Vx6I | |
www.youtube.com/embed/Cgz-Tt2uT1s | |
www.youtube.com/embed/AoWXQ0EjKsM | |
www.youtube.com/embed/cebRHe2MdZU | |
www.youtube.com/embed/Yw881tXm5kA | |
www.youtube.com/embed/Oc1M3_IIEec | |
www.youtube.com/embed/9Hk5Z7OdEfM | |
www.youtube.com/embed/YMab5ngknCM | |
www.youtube.com/embed/aOkrdFwQeGk | |
www.youtube.com/embed/OBR9jhSbsKI | |
www.youtube.com/embed/Talha1Dxrkc | |
www.youtube.com/embed/9Hk5Z7OdEfM | |
www.youtube.com/embed/AXrlH1HbGxs | |
www.youtube.com/embed/JQzAtTo2R5k | |
www.youtube.com/embed/Y9O_wn2E4tM | |
www.youtube.com/embed/Sm-2Rv5Amss | |
www.youtube.com/embed/8cbXqZDBNag | |
www.youtube.com/embed/6XB_EXfEpyQ | |
www.youtube.com/embed/FPgmWk4gUUI | |
www.youtube.com/embed/RqLLl4reKLE | |
www.youtube.com/embed/cebRHe2MdZU | |
www.youtube.com/embed/SAlmxscDqBA | |
www.youtube.com/embed/YJsq9ZIb_zs | |
www.youtube.com/embed/IqiBA8XQB_k | |
www.youtube.com/embed/jSKfAsUYPcA | |
www.youtube.com/embed/NF9ldBwg3Xg | |
www.youtube.com/embed/kM7EhANWK18 | |
www.youtube.com/embed/JEbTnatNCUw | |
www.youtube.com/embed/r6nwALqxPko | |
www.youtube.com/embed/xA8Djwp3WCA | |
www.youtube.com/embed/Yw881tXm5kA | |
www.youtube.com/embed/j8y_r70do7E | |
www.youtube.com/embed/6jDU91ROm_k | |
www.youtube.com/embed/YXWOxk0K2Xc | |
www.youtube.com/embed/nHl_VKJjdCc | |
www.youtube.com/embed/_nVATHljm94 | |
www.youtube.com/embed/Yw_SoH92QRA | |
www.youtube.com/embed/hwjY6IXAcJE | |
www.youtube.com/embed/1j3QHyBOyhg | |
www.youtube.com/embed/xsIfOKQGkD4 | |
www.youtube.com/embed/z2EAeLussUQ | |
www.youtube.com/embed/0KAwQlOyzfg | |
www.youtube.com/embed/zkIsQdy73Vw | |
www.youtube.com/embed/CvOsXsFa5CU | |
www.youtube.com/embed/2Cngz0LEBaw | |
www.youtube.com/embed/cNbdc_J9cvM | |
www.youtube.com/embed/WEB43yz35uo | |
www.youtube.com/embed/7qIbm2eHMks | |
www.youtube.com/embed/o0M5I-nXD54 | |
www.youtube.com/embed/9Hk5Z7OdEfM | |
www.youtube.com/embed/Sis1IJGLR7c | |
www.youtube.com/embed/54lqm5qTJCA | |
www.youtube.com/embed/X_7_nyeho8I | |
www.youtube.com/embed/kM7EhANWK18 | |
www.youtube.com/embed/AzEXmGoJdRU | |
www.youtube.com/embed/fR3Z6Xex56c | |
www.youtube.com/embed/IVsNM1aDTos | |
www.youtube.com/embed/zE6Am3xKWUk | |
www.youtube.com/embed/9V9h8m2DPGI | |
www.youtube.com/embed/djuq-hQfj1k | |
www.youtube.com/embed/OVK2FTGaZ7k | |
www.youtube.com/embed/db-SJ8Te4do | |
www.youtube.com/embed/9tJCw0Rdxwc | |
www.youtube.com/embed/WVYx2fcYJ48 | |
www.youtube.com/embed/qpXdFNfAjww | |
www.youtube.com/embed/y3x2pEYhywQ | |
www.youtube.com/embed/18-TC_B5a4Y | |
www.youtube.com/embed/WEo5fVM87-k | |
www.youtube.com/embed/47t75g1RL4M | |
www.youtube.com/embed/TN9qs_71gf4 | |
www.youtube.com/embed/lhMmiX48BKk | |
www.youtube.com/embed/S4c2ynj7FnA | |
www.youtube.com/embed/Qmoy3-E5mF4 | |
www.youtube.com/embed/ekjfLHk2Evc | |
www.youtube.com/embed/xvvfvi-cypY | |
www.youtube.com/embed/mFPg6hUHEDo | |
www.youtube.com/embed/dNBAtLaOybQ | |
www.youtube.com/embed/cx0BYvAtrwI | |
www.youtube.com/embed/I5fL7azJCm0 | |
www.youtube.com/embed/3sGephFkons | |
www.youtube.com/embed/9w3XfRO-8mw | |
www.youtube.com/embed/Be2xAMLYWFo | |
www.youtube.com/embed/VVVz702mcsE | |
www.youtube.com/embed/VZLzvll_ZAo | |
www.youtube.com/embed/6QEoLCMLXQg | |
www.youtube.com/embed/QB04hw_7sn0 | |
www.youtube.com/embed/mcQozVT89b4 | |
www.youtube.com/embed/eDi-Aq1D63A | |
www.youtube.com/embed/Ac1WoNUeVS4 | |
www.youtube.com/embed/cHoHVyQo2pM | |
www.youtube.com/embed/uTfnoDEDLlk | |
www.youtube.com/embed/lOjO9aJXzUo | |
www.youtube.com/embed/vWRWFgPNK0g | |
www.youtube.com/embed/62RSfIBzID8 | |
www.youtube.com/embed/DXWnZFS_nnI | |
www.youtube.com/embed/iyVjN5mgpAQ | |
www.youtube.com/embed/wPet4QofrIc | |
www.youtube.com/embed/L_tdq_GrGJs | |
www.youtube.com/embed/9oschp6dRlQ | |
www.youtube.com/embed/9RIokpqlhUM | |
www.youtube.com/embed/zJ5VpiENqfE | |
www.youtube.com/embed/Ku5s5C1DnEY | |
www.youtube.com/embed/FDimmgpVnkc | |
www.youtube.com/embed/maG9QWQaVEM | |
www.youtube.com/embed/4UExKAdwq70 | |
www.youtube.com/embed/T7JYJ1hECgk | |
www.youtube.com/embed/l1wBJgD4F5Y | |
www.youtube.com/embed/uWxGZkZerLA | |
www.youtube.com/embed/oG88pUcHCwU | |
www.youtube.com/embed/u5tubn74_YY | |
""" | |
/// Splits `str` on every occurrence of `sep`, dropping empty pieces.
let split (str : string) (sep : string) =
    let separators = [| sep |]
    str.Split(separators, StringSplitOptions.RemoveEmptyEntries)
/// Final element of `x`; indexing fails on an empty array.
let last (x : string []) =
    let finalIndex = Array.length x - 1
    Array.get x finalIndex
/// Watch-page addresses built from the pasted `urls` text: every line that
/// mentions "youtube" contributes its last "/"-separated segment as the
/// video id.
let videoUrls =
    [ for line in split urls "\n" do
        if line.Contains("youtube") then
            let id = last (split line "/")
            // Trim after concatenation so whitespace left at the end of a
            // line (for instance a carriage return) does not reach the address.
            yield ("http://youtube.com/watch?v=" + id).Trim() ]
/// Data collected for one video page.
type VideoInfo =
    {
        /// Address the page was requested from.
        Url : string
        /// Title text of the page.
        Title : string
        /// View count.
        Views : int
        /// Count read from the like button.
        Pluses : int
    }
/// Inner text of the first node in `x`, or "" when `x` has no nodes.
let f (x : seq<HtmlNode>) =
    match List.ofSeq x with
    | first :: _ -> first.InnerText()
    | [] -> ""
/// Placeholder record for `url`: blank title, zero views, zero pluses.
let empty url : VideoInfo =
    { Url = url
      Title = ""
      Views = 0
      Pluses = 0 }
/// Downloads the page at `videoUrl` and extracts its title, view count and
/// like count into a VideoInfo.
/// Never raises: when the page cannot be loaded or read, the failure is
/// reported on stderr and the placeholder `empty videoUrl` is returned.
/// A count that is missing or unparsable becomes 0 without discarding the
/// other fields.
let scrape (videoUrl : string) =
    // First run of digits/commas in `str` as an int, e.g. "1,234 views"
    // gives 1234. Returns 0 when there is no such run or it does not fit
    // in an int.
    let parseInt (str : string) =
        let m = Regex.Match(str, "([0-9,]+)")
        // m.Value is "" for a failed match, which TryParse rejects.
        match Int32.TryParse(m.Value.Replace(",", "")) with
        | true, number -> number
        | _ -> 0
    try
        let page = HtmlDocument.Load(videoUrl)
        let title = page.CssSelect("#eow-title") |> f
        let views = page.CssSelect(".watch-view-count") |> f
        let pluses =
            page.CssSelect(".like-button-renderer").CssSelect(".yt-uix-button-content")
            |> f
        { Url = videoUrl
          Title = title.Trim()
          Views = parseInt views
          // Parsed the same way as the view count: a plain `int` conversion
          // throws on blank text or on thousands separators, and that used
          // to blank out the whole record.
          Pluses = parseInt pluses }
    with ex ->
        // Best effort is kept, but the reason is no longer silently dropped.
        eprintfn "Could not scrape %s: %s" videoUrl ex.Message
        empty videoUrl
/// Scrapes every address in `videoUrls` and writes one tab-separated row per
/// video (title, url, pluses, views) below a header row.
/// The output file is the first command-line argument when one is given;
/// otherwise the original default path is used. Returns exit code 0.
[<EntryPoint>]
let main (argv : string []) =
    let outputPath =
        match argv with
        | [||] -> @"d:\workspace\salsa.csv"
        | args -> args.[0]
    // `use` closes (and thereby flushes) the writer when main returns.
    use file = System.IO.File.CreateText(outputPath)
    fprintfn file "Title\tUrl\tPluses\tViews"
    for url in videoUrls do
        Console.WriteLine("Requesting " + url)
        let r = scrape url
        fprintfn file "%s\t%s\t%d\t%d" r.Title r.Url r.Pluses r.Views
    0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Result:
https://docs.google.com/spreadsheets/d/1Y85hp768AHCJmx6zwQrk9kwDdRnCmbZIV9ULGhu3Heg/pubhtml?gid=373290054&single=true