Skip to content

Instantly share code, notes, and snippets.

@EmmanuelOga
Created December 27, 2016 03:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save EmmanuelOga/6c1cbb1556fb6881106bb0365a5792c5 to your computer and use it in GitHub Desktop.
Save EmmanuelOga/6c1cbb1556fb6881106bb0365a5792c5 to your computer and use it in GitHub Desktop.
Scraping salsa songs with F#!~
# Scraping http://salsahackers.com/dj-pick/
open System
open System.Text.RegularExpressions
open FSharp.Data
open FSharp.Data
/// Debug helper: calls `Dump()` on the value and then hands the same value back,
/// so it can be inserted in the middle of a pipeline.
/// NOTE(review): `Dump` is not defined in this file — presumably LINQPad's
/// extension method; confirm before building outside LINQPad.
let Show x = x.Dump(); x
/// Scheme and host of the site being scraped (no trailing slash).
let domain = "http://salsahackers.com"

/// Address of the page whose links are scanned for DJ pages.
let startUrl = sprintf "%s/dj-pick/" domain
/// Return a function able to extract an attribute from an HtmlNode.
///
/// The returned function yields the value of the attribute named `attrib`,
/// or "" when the node has no such attribute.
let attribGetter attrib =
    // AttributeValue already returns "" for a missing attribute, so there is no
    // need to raise and then swallow an exception for every node that lacks it.
    fun (link : HtmlNode) -> link.AttributeValue attrib
/// Links to each dj page.
///
/// Evaluates to a lazy sequence of site-relative paths: the part of every
/// matching link that follows `domain`, with any "%20" removed. The start page
/// is downloaded once, when this value is initialised.
let djUrls =
    // True for links on this site that point at a DJ page, excluding the
    // "share by email" links and the start page itself.
    let isDjUrl (url : string) =
        url.Contains "dj-pick"
        && not (url.Contains "share=email")
        && url.Contains domain
        && url <> startUrl

    // Keep only what follows `domain` in an absolute url.
    let toPath (url : string) =
        let parts = url.Split([| domain |], StringSplitOptions.RemoveEmptyEntries)
        // Array.last replaces the LINQ `ElementAt` call, which only resolves when
        // System.Linq is opened — and this file never opens it.
        (Array.last parts).Replace("%20", "")

    let links (page : HtmlDocument) filter =
        page.Descendants ["a"]
        |> Seq.map (attribGetter "href")
        |> Seq.filter filter
        |> Seq.map toPath

    links (HtmlDocument.Load(startUrl)) isDjUrl
/// Extract youtube urls from a page.
///
/// `path` is appended to `domain` to form the address of the page to load; the
/// result is the `src` of every iframe on it whose value contains "youtube.com".
let videoUrls (path : string) =
    let pageUrl = domain + path
    let page = HtmlDocument.Load(pageUrl)
    let iframeSources =
        page.Descendants ["iframe"]
        |> Seq.map (attribGetter "src")
    iframeSources
    |> Seq.filter (fun (src : string) -> src.Contains("youtube.com"))
/// Youtube urls gathered from every page in `djUrls`, with each "//" removed.
let allVideos =
    djUrls
    |> Seq.collect videoUrls
    |> Seq.map (fun (embedUrl : string) -> embedUrl.Replace("//", ""))
## Using the above output, I copy-pasted these urls...
## Yeah, I know, copy-paste. But I was using LINQPad for exploration :-)
/// Hard-coded, newline-separated list of youtube embed urls, one per line, each
/// written without a scheme as "www.youtube.com/embed/<id>".
/// NOTE(review): several ids occur more than once (e.g. 9Hk5Z7OdEfM), so the same
/// video is listed repeatedly — confirm whether that is intended.
let urls = """
www.youtube.com/embed/FuontJukpL0
www.youtube.com/embed/Ti2SGzb2nkc
www.youtube.com/embed/YmC5T9qdplk
www.youtube.com/embed/e9z8W21kddw
www.youtube.com/embed/zonS5MybYZs
www.youtube.com/embed/0rfvKpfCth4
www.youtube.com/embed/qFMN90bHIjA
www.youtube.com/embed/2_SORYk6wfc
www.youtube.com/embed/aGrNVazCJc8
www.youtube.com/embed/lixi7dX8Uj8
www.youtube.com/embed/NOrfzcA6jzI
www.youtube.com/embed/mLwwyCKZZdk
www.youtube.com/embed/cpYPPXf63a4
www.youtube.com/embed/AXnywtgrhYc
www.youtube.com/embed/yCxD5ni5aOg
www.youtube.com/embed/UuCKLBpI_U8
www.youtube.com/embed/DLVDjZV6kV0
www.youtube.com/embed/Dqt8x19rlGU
www.youtube.com/embed/uodvLaQ0H5k
www.youtube.com/embed/jAQMFxayXHw
www.youtube.com/embed/4XibiJUmuzM
www.youtube.com/embed/TShBHU70hVE
www.youtube.com/embed/VurAF9ioycw
www.youtube.com/embed/dkHyQh4qvTo
www.youtube.com/embed/Mfqcwb83Rkg
www.youtube.com/embed/mVS0DAqsZQM
www.youtube.com/embed/4_YBpQNh4-M
www.youtube.com/embed/XUx9Sdj8DXM
www.youtube.com/embed/EKF0jp_gs3Y
www.youtube.com/embed/mI-H9LNS9Qg
www.youtube.com/embed/ZGPal-jsris
www.youtube.com/embed/X4pRaK_Bx3g
www.youtube.com/embed/HUAMlPmXGII
www.youtube.com/embed/c4vMlD038To
www.youtube.com/embed/etCBERs0LAM
www.youtube.com/embed/2veSIbjo0g0
www.youtube.com/embed/EXPsbQHAx_0
www.youtube.com/embed/xFwbjFbvmn8
www.youtube.com/embed/3hLgxMoRpSs
www.youtube.com/embed/X6iUcbKGIrg
www.youtube.com/embed/VpFiiXeG1hc
www.youtube.com/embed/HBegogL6OZk
www.youtube.com/embed/VndJ7djo2QU
www.youtube.com/embed/oca2doa0k18
www.youtube.com/embed/NdJ4D-7mKeY
www.youtube.com/embed/k9iKIM2QhlU
www.youtube.com/embed/qdIjJYvj6W0
www.youtube.com/embed/zM5bRA74-34
www.youtube.com/embed/yJMdTeH24S0
www.youtube.com/embed/47t75g1RL4M
www.youtube.com/embed/_Vsv2BJ20lg
www.youtube.com/embed/7ROfIDs0O6M
www.youtube.com/embed/vwCDSZAX3Lo
www.youtube.com/embed/qMrgVT_Xk28
www.youtube.com/embed/uSamTXs_338
www.youtube.com/embed/c4vMlD038To
www.youtube.com/embed/Qmoy3-E5mF4
www.youtube.com/embed/9Hk5Z7OdEfM
www.youtube.com/embed/Y9O_wn2E4tM
www.youtube.com/embed/TN9qs_71gf4
www.youtube.com/embed/e6Q2Gxy_KPY
www.youtube.com/embed/QWSj1uXqm50
www.youtube.com/embed/mEt26EOSRL4
www.youtube.com/embed/ogfzzxMYCvk
www.youtube.com/embed/WArlszvt4u4
www.youtube.com/embed/4WFpjXctJFw
www.youtube.com/embed/IAdiujM0C0s
www.youtube.com/embed/SETx4HXqzyI
www.youtube.com/embed/sIjDrjEJ-LU
www.youtube.com/embed/uyvh8giWRbo
www.youtube.com/embed/lmOYAyNxHbY
www.youtube.com/embed/mSSPCyNrY6Y
www.youtube.com/embed/S4c2ynj7FnA
www.youtube.com/embed/hfxeO6O4WxI
www.youtube.com/embed/BWrugW1rXjU
www.youtube.com/embed/bfLnRe76CNk
www.youtube.com/embed/Nsm-AGAWIcQ
www.youtube.com/embed/nsks0_RubM8
www.youtube.com/embed/bpQzU2qaDqY
www.youtube.com/embed/_43zLcr-MRQ
www.youtube.com/embed/16gWwH8Vx6I
www.youtube.com/embed/Cgz-Tt2uT1s
www.youtube.com/embed/AoWXQ0EjKsM
www.youtube.com/embed/cebRHe2MdZU
www.youtube.com/embed/Yw881tXm5kA
www.youtube.com/embed/Oc1M3_IIEec
www.youtube.com/embed/9Hk5Z7OdEfM
www.youtube.com/embed/YMab5ngknCM
www.youtube.com/embed/aOkrdFwQeGk
www.youtube.com/embed/OBR9jhSbsKI
www.youtube.com/embed/Talha1Dxrkc
www.youtube.com/embed/9Hk5Z7OdEfM
www.youtube.com/embed/AXrlH1HbGxs
www.youtube.com/embed/JQzAtTo2R5k
www.youtube.com/embed/Y9O_wn2E4tM
www.youtube.com/embed/Sm-2Rv5Amss
www.youtube.com/embed/8cbXqZDBNag
www.youtube.com/embed/6XB_EXfEpyQ
www.youtube.com/embed/FPgmWk4gUUI
www.youtube.com/embed/RqLLl4reKLE
www.youtube.com/embed/cebRHe2MdZU
www.youtube.com/embed/SAlmxscDqBA
www.youtube.com/embed/YJsq9ZIb_zs
www.youtube.com/embed/IqiBA8XQB_k
www.youtube.com/embed/jSKfAsUYPcA
www.youtube.com/embed/NF9ldBwg3Xg
www.youtube.com/embed/kM7EhANWK18
www.youtube.com/embed/JEbTnatNCUw
www.youtube.com/embed/r6nwALqxPko
www.youtube.com/embed/xA8Djwp3WCA
www.youtube.com/embed/Yw881tXm5kA
www.youtube.com/embed/j8y_r70do7E
www.youtube.com/embed/6jDU91ROm_k
www.youtube.com/embed/YXWOxk0K2Xc
www.youtube.com/embed/nHl_VKJjdCc
www.youtube.com/embed/_nVATHljm94
www.youtube.com/embed/Yw_SoH92QRA
www.youtube.com/embed/hwjY6IXAcJE
www.youtube.com/embed/1j3QHyBOyhg
www.youtube.com/embed/xsIfOKQGkD4
www.youtube.com/embed/z2EAeLussUQ
www.youtube.com/embed/0KAwQlOyzfg
www.youtube.com/embed/zkIsQdy73Vw
www.youtube.com/embed/CvOsXsFa5CU
www.youtube.com/embed/2Cngz0LEBaw
www.youtube.com/embed/cNbdc_J9cvM
www.youtube.com/embed/WEB43yz35uo
www.youtube.com/embed/7qIbm2eHMks
www.youtube.com/embed/o0M5I-nXD54
www.youtube.com/embed/9Hk5Z7OdEfM
www.youtube.com/embed/Sis1IJGLR7c
www.youtube.com/embed/54lqm5qTJCA
www.youtube.com/embed/X_7_nyeho8I
www.youtube.com/embed/kM7EhANWK18
www.youtube.com/embed/AzEXmGoJdRU
www.youtube.com/embed/fR3Z6Xex56c
www.youtube.com/embed/IVsNM1aDTos
www.youtube.com/embed/zE6Am3xKWUk
www.youtube.com/embed/9V9h8m2DPGI
www.youtube.com/embed/djuq-hQfj1k
www.youtube.com/embed/OVK2FTGaZ7k
www.youtube.com/embed/db-SJ8Te4do
www.youtube.com/embed/9tJCw0Rdxwc
www.youtube.com/embed/WVYx2fcYJ48
www.youtube.com/embed/qpXdFNfAjww
www.youtube.com/embed/y3x2pEYhywQ
www.youtube.com/embed/18-TC_B5a4Y
www.youtube.com/embed/WEo5fVM87-k
www.youtube.com/embed/47t75g1RL4M
www.youtube.com/embed/TN9qs_71gf4
www.youtube.com/embed/lhMmiX48BKk
www.youtube.com/embed/S4c2ynj7FnA
www.youtube.com/embed/Qmoy3-E5mF4
www.youtube.com/embed/ekjfLHk2Evc
www.youtube.com/embed/xvvfvi-cypY
www.youtube.com/embed/mFPg6hUHEDo
www.youtube.com/embed/dNBAtLaOybQ
www.youtube.com/embed/cx0BYvAtrwI
www.youtube.com/embed/I5fL7azJCm0
www.youtube.com/embed/3sGephFkons
www.youtube.com/embed/9w3XfRO-8mw
www.youtube.com/embed/Be2xAMLYWFo
www.youtube.com/embed/VVVz702mcsE
www.youtube.com/embed/VZLzvll_ZAo
www.youtube.com/embed/6QEoLCMLXQg
www.youtube.com/embed/QB04hw_7sn0
www.youtube.com/embed/mcQozVT89b4
www.youtube.com/embed/eDi-Aq1D63A
www.youtube.com/embed/Ac1WoNUeVS4
www.youtube.com/embed/cHoHVyQo2pM
www.youtube.com/embed/uTfnoDEDLlk
www.youtube.com/embed/lOjO9aJXzUo
www.youtube.com/embed/vWRWFgPNK0g
www.youtube.com/embed/62RSfIBzID8
www.youtube.com/embed/DXWnZFS_nnI
www.youtube.com/embed/iyVjN5mgpAQ
www.youtube.com/embed/wPet4QofrIc
www.youtube.com/embed/L_tdq_GrGJs
www.youtube.com/embed/9oschp6dRlQ
www.youtube.com/embed/9RIokpqlhUM
www.youtube.com/embed/zJ5VpiENqfE
www.youtube.com/embed/Ku5s5C1DnEY
www.youtube.com/embed/FDimmgpVnkc
www.youtube.com/embed/maG9QWQaVEM
www.youtube.com/embed/4UExKAdwq70
www.youtube.com/embed/T7JYJ1hECgk
www.youtube.com/embed/l1wBJgD4F5Y
www.youtube.com/embed/uWxGZkZerLA
www.youtube.com/embed/oG88pUcHCwU
www.youtube.com/embed/u5tubn74_YY
"""
/// Break `str` apart at every occurrence of `sep`, discarding empty pieces.
let split (str : string) (sep : string) =
    let separators = [| sep |]
    str.Split(separators, StringSplitOptions.RemoveEmptyEntries)
/// Final element of a string array; an empty array raises IndexOutOfRangeException.
let last (x : string []) =
    let finalIndex = Array.length x - 1
    Array.get x finalIndex
/// Watch-page address for every line of `urls` that contains "youtube", in
/// order: the text after the line's last "/" is appended to
/// "http://youtube.com/watch?v=" and the result is trimmed.
let videoUrls =
    [ for line in split urls "\n" do
        if line.Contains("youtube") then
            let videoId = last (split line "/")
            yield ("http://youtube.com/watch?v=" + videoId).Trim() ]
/// Data collected for one video.
type VideoInfo =
    {
        /// Address the video was requested from.
        Url : string
        /// Title of the video.
        Title : string
        /// Number of views.
        Views : int
        /// Number of likes.
        Pluses : int
    }
/// Inner text of the first node in `x`, or "" when `x` contains no nodes.
let f (x : seq<HtmlNode>) =
    match List.ofSeq x with
    | [] -> ""
    | first :: _ -> first.InnerText()
/// Placeholder record for `url`: blank title and zero counts.
let empty url =
    { Url = url
      Title = ""
      Views = 0
      Pluses = 0 }
/// Download the page at `videoUrl` and read the video's title, view count and
/// like count out of its HTML.
///
/// Scraping is best-effort: any failure (network error, unexpected markup, ...)
/// is reported on stderr and yields `empty videoUrl`, so one bad page does not
/// abort the whole run. A count that is missing or unreadable becomes 0 while
/// the rest of the record is kept.
let scrape (videoUrl : string) =
    // First run of digits/commas in `str` as an int ("1,234 views" -> 1234);
    // 0 when there is no such run or the number does not fit in an int.
    let parseInt (str : string) =
        let digits = Regex.Match(str, "([0-9,]+)").Value.Replace(",", "")
        match Int32.TryParse(digits) with
        | true, n -> n
        | _ -> 0

    try
        let page = HtmlDocument.Load(videoUrl)
        let title = page.CssSelect("#eow-title") |> f
        let views = page.CssSelect(".watch-view-count") |> f
        let pluses =
            page.CssSelect(".like-button-renderer").CssSelect(".yt-uix-button-content") |> f
        { Url = videoUrl
          Title = title.Trim()
          Views = parseInt views
          // Parsed the same way as the view count: a plain `int` conversion
          // rejects the thousands separators in text such as "1,234", and the
          // resulting exception would throw the whole record away.
          Pluses = parseInt pluses }
    with ex ->
        eprintfn "Could not scrape %s: %s" videoUrl ex.Message
        empty videoUrl
/// Program entry point: scrape every url in `videoUrls` and write one
/// tab-separated row per video, preceded by a header row.
///
/// `argv.[0]`, when given, is the path of the output file; without arguments
/// the file is written to d:\workspace\salsa.csv. Always returns exit code 0.
[<EntryPoint>]
let main argv =
    let outputPath =
        match Array.tryHead argv with
        | Some path -> path
        | None -> @"d:\workspace\salsa.csv"

    // A tab or line break inside a title would shift or split the row, so each
    // one is replaced with a space before writing.
    let cell (text : string) =
        text.Replace('\t', ' ').Replace('\r', ' ').Replace('\n', ' ')

    use file = System.IO.File.CreateText(outputPath)
    fprintfn file "Title\tUrl\tPluses\tViews"
    for url in videoUrls do
        Console.WriteLine("Requesting " + url)
        let r = scrape url
        fprintfn file "%s\t%s\t%d\t%d" (cell r.Title) r.Url r.Pluses r.Views
    0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment