Skip to content

Instantly share code, notes, and snippets.

@Gutek
Created March 1, 2016 14:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Gutek/70d5b5223e5c35b5bbd5 to your computer and use it in GitHub Desktop.
Save Gutek/70d5b5223e5c35b5bbd5 to your computer and use it in GitHub Desktop.
Finds RSS feeds for blogs
open System.IO
open FSharp.Data
/// Reads the text file at `filePath` and returns its lines as an array.
let readLines (filePath: string) : string[] =
    File.ReadAllLines filePath
/// Maps every address in `links` to an asynchronous `HtmlDocument` load.
/// The result is a lazy sequence: each computation is created only when
/// the sequence is enumerated, and none of them is started here.
let downloadWebAsync (links:seq<string>) =
    seq { for address in links -> HtmlDocument.AsyncLoad(address) }
/// Projects link nodes to `(href, type)` string pairs.
/// A node is kept only when it carries BOTH attributes; nodes missing
/// either one are dropped.
let castToOption (linkElements:seq<HtmlNode>) =
    linkElements
    |> Seq.choose (fun node ->
        // Both lookups go through TryGetAttribute: Attribute("href") raises
        // when the attribute is absent (per the FSharp.Data HtmlNode docs),
        // which would abort the enumeration on a <link> without an href.
        match node.TryGetAttribute("href"), node.TryGetAttribute("type") with
        | Some href, Some kind -> Some (href.Value(), kind.Value())
        | _ -> None)
/// Returns the descendants of `doc` whose element name is "link".
let getLinkElements (doc:HtmlDocument) : seq<HtmlNode> =
    let tagNames = [ "link" ]
    doc.Descendants(tagNames)
/// Predicate over an `(address, type)` pair: true when the type equals
/// "application/rss+xml", compared ordinally and ignoring case.
/// The address component is not inspected.
let filterOnlyRss (_:string, t:string) =
    let rssMimeType = "application/rss+xml"
    t.Equals(rssMimeType, System.StringComparison.OrdinalIgnoreCase)
/// Predicate over an `(address, type)` pair: true when the lower-cased
/// address does NOT contain the text "comments".
/// The type component is not inspected.
let filterNoComments (uri:string, _:string) =
    let lowered = uri.ToLower()
    not (lowered.Contains("comments"))
/// Addresses to scan, taken from "urls.txt" (one per line).
/// Each line is trimmed; blank lines and lines starting with "//" are
/// skipped so that they are never handed to the downloader as URLs.
let getLinks =
    readLines "urls.txt"
    |> Seq.map (fun line -> line.Trim())
    |> Seq.filter (fun line ->
        // Ordinal comparison: "//" is a literal marker, not culture-sensitive text.
        line <> "" && not (line.StartsWith("//", System.StringComparison.Ordinal)))
/// Downloads every page listed by `getLinks` in parallel and yields, per
/// page, the `(href, type)` pairs of its RSS links whose address does not
/// mention "comments".
/// A page that fails to download is reported on stderr and skipped, so a
/// single bad address no longer aborts the whole run.
let loadAll =
    getLinks
    |> downloadWebAsync
    // Async.Catch turns an exception into Choice2Of2 instead of letting it
    // escape from Async.RunSynchronously.
    |> Seq.map Async.Catch
    |> Async.Parallel
    |> Async.RunSynchronously
    |> Seq.choose (fun outcome ->
        match outcome with
        | Choice1Of2 doc -> Some doc
        | Choice2Of2 err ->
            eprintfn "FAILED WITH: %A" err
            None)
    |> Seq.map (fun doc ->
        doc
        |> getLinkElements
        |> castToOption
        |> Seq.filter filterOnlyRss
        |> Seq.filter filterNoComments)
//
/// Entry point: flattens the per-page results of `loadAll`, removes
/// duplicate pairs, prints the address of each remaining pair on its own
/// line, and returns exit code 0. `argv` is not used.
[<EntryPoint>]
let main argv =
    let uniqueFeeds = loadAll |> Seq.concat |> Seq.distinct
    for (feedUrl, _) in uniqueFeeds do
        printfn "%s" feedUrl
    0
@Krzysztof-Cieslak
Copy link

I guess something like this would work (not tested). Async.Catch does the trick: it returns an F# Choice type, Choice<HtmlDocument, Exception> in this case.

open System.IO
open FSharp.Data

/// Loads the text file at `filePath` and returns all of its lines.
let readLines (filePath: string) : string[] =
    File.ReadAllLines(filePath)

/// Maps every address in `links` to an asynchronous `HtmlDocument` load
/// wrapped in `Async.Catch`, so each computation yields a `Choice` holding
/// either the document or the exception instead of raising.
/// The returned sequence is lazy and nothing is started here.
let downloadWebAsync (links:seq<string>) =
    seq {
        for address in links ->
            HtmlDocument.AsyncLoad(address) |> Async.Catch
    }

/// Projects link nodes to `(href, type)` string pairs.
/// A node is kept only when it carries BOTH attributes; nodes missing
/// either one are dropped.
let castToOption (linkElements:seq<HtmlNode>) =
    linkElements
    |> Seq.choose (fun node ->
        // Both lookups go through TryGetAttribute: Attribute("href") raises
        // when the attribute is absent (per the FSharp.Data HtmlNode docs),
        // which would abort the enumeration on a <link> without an href.
        match node.TryGetAttribute("href"), node.TryGetAttribute("type") with
        | Some href, Some kind -> Some (href.Value(), kind.Value())
        | _ -> None)

/// Returns the descendants of `doc` whose element name is "link".
let getLinkElements (doc:HtmlDocument) : seq<HtmlNode> =
    let tagNames = [ "link" ]
    doc.Descendants(tagNames)

/// Predicate over an `(address, type)` pair: true when the type equals
/// "application/rss+xml", compared ordinally and ignoring case.
/// The address component is not inspected.
let filterOnlyRss (_:string, t:string) =
    let rssMimeType = "application/rss+xml"
    t.Equals(rssMimeType, System.StringComparison.OrdinalIgnoreCase)

/// Predicate over an `(address, type)` pair: true when the lower-cased
/// address does NOT contain the text "comments".
/// The type component is not inspected.
let filterNoComments (uri:string, _:string) =
    let lowered = uri.ToLower()
    not (lowered.Contains("comments"))

/// Reads "urls.txt" and returns the addresses to scan (one per line).
/// Each line is trimmed; blank lines and lines starting with "//" are
/// skipped so that they are never handed to the downloader as URLs.
let getLinks () =
    readLines "urls.txt"
    |> Seq.map (fun line -> line.Trim())
    |> Seq.filter (fun line ->
        // Ordinal comparison: "//" is a literal marker, not culture-sensitive text.
        line <> "" && not (line.StartsWith("//", System.StringComparison.Ordinal)))

/// Runs the downloads for every address from `getLinks ()` in parallel and
/// blocks until all have finished, returning one result per address.
let loadAll () =
    let downloads = getLinks () |> downloadWebAsync
    downloads
    |> Async.Parallel
    |> Async.RunSynchronously

/// Extracts from `doc` the `(href, type)` pairs of its link elements that
/// pass both `filterOnlyRss` and `filterNoComments`.
let findLinks (doc: HtmlDocument) =
    doc
    |> getLinkElements
    |> castToOption
    |> Seq.filter filterOnlyRss
    |> Seq.filter filterNoComments



/// Entry point: downloads all pages, collects the feed links of the pages
/// that loaded, removes duplicates, and prints each feed address on its
/// own line. Returns exit code 0. `argv` is not used.
[<EntryPoint>]
let main argv =

    loadAll ()
    |> Array.collect (fun outcome ->
        match outcome with
        | Choice1Of2 doc -> findLinks doc |> Seq.toArray
        | Choice2Of2 err ->
            // Diagnostics go to stderr so stdout carries feed addresses only.
            eprintfn "FAILED WITH: %A" err
            [||]
    )
    |> Seq.distinct
    |> Seq.iter (fun (feedUrl, _) -> printfn "%s" feedUrl)

    0

@Gutek
Copy link
Author

Gutek commented Mar 1, 2016

thanks, that works perfectly. now i need to understand whats going on there :)

@orient-man
Copy link

Krzysiek przerobił świetnie, ale dorzucę jeszcze swoje "muśnięcia":

#r "./packages/FSharp.Data/lib/net40/FSharp.Data.dll"
open System
open System.IO
open FSharp.Data

/// Loads the text file at `filePath` and returns all of its lines.
let readLines (filePath: string) : string[] =
    filePath |> File.ReadAllLines

/// Builds the asynchronous load of the page at `uri`, wrapped in
/// `Async.Catch` so the computation yields a `Choice` holding either the
/// document or the exception instead of raising.
let downloadWebAsync (uri: string) =
    HtmlDocument.AsyncLoad(uri) |> Async.Catch

/// Returns the descendants of `doc` whose element name is "link".
let getLinkElements (doc:HtmlDocument) =
    let tagNames = [ "link" ]
    doc.Descendants(tagNames)

/// Returns `Some (href, type)` when `link` carries both attributes and
/// `None` when either is missing.
let tryGetLinkInfo (link:HtmlNode) =
    // Both lookups go through TryGetAttribute: Attribute("href") raises when
    // the attribute is absent (per the FSharp.Data HtmlNode docs), which
    // would abort the pipeline on a <link> that has a type but no href.
    match link.TryGetAttribute("href"), link.TryGetAttribute("type") with
    | Some href, Some t -> Some (href.Value(), t.Value())
    | _ -> None

/// Predicate over an `(address, type)` pair: true when the type equals
/// "application/rss+xml", compared ordinally and ignoring case.
/// The first component is not inspected.
let filterOnlyRss (_, t:string) =
    let rssMimeType = "application/rss+xml"
    t.Equals(rssMimeType, StringComparison.OrdinalIgnoreCase)

/// Predicate over an `(address, type)` pair: true when the lower-cased
/// address does NOT contain the text "comments".
/// The second component is not inspected.
let filterNoComments (uri:string, _) =
    let mentionsComments = uri.ToLower().Contains("comments")
    not mentionsComments

/// Addresses to scan, taken from the urls.txt file (one per line).
/// Each line is trimmed; blank lines and lines starting with "//" are
/// skipped so that they are never handed to the downloader as URLs.
let getLinks =
    readLines "h:/projekty/_robol/urls.txt"
    |> Seq.map (fun line -> line.Trim())
    |> Seq.filter (fun line ->
        // Ordinal comparison: "//" is a literal marker, not culture-sensitive text.
        line <> "" && not (line.StartsWith("//", StringComparison.Ordinal)))

/// Runs one download per address in `getLinks` in parallel and blocks
/// until all have finished, returning one result per address.
let loadAll () =
    let downloads = Seq.map downloadWebAsync getLinks
    downloads
    |> Async.Parallel
    |> Async.RunSynchronously

/// Extracts from `doc` the `(href, type)` pairs of its link elements that
/// pass both `filterOnlyRss` and `filterNoComments`.
let findLinks (doc: HtmlDocument) =
    doc
    |> getLinkElements
    |> Seq.choose tryGetLinkInfo
    |> Seq.filter filterOnlyRss
    |> Seq.filter filterNoComments

/// Unwraps a `Choice` result: `Choice1Of2 d` becomes `Some d`;
/// `Choice2Of2 e` is reported on stderr and becomes `None`.
let filterErrors result =
    match result with
    | Choice1Of2 d -> Some d
    | Choice2Of2 e ->
        // Diagnostics go to stderr so stdout carries feed addresses only.
        eprintfn "FAILED WITH: %A" e
        None

// Download every page, keep those that loaded, gather their feed links,
// drop duplicate pairs, and print each feed address on its own line.
for (feedUrl, _) in loadAll () |> Seq.choose filterErrors |> Seq.collect findLinks |> Seq.distinct do
    printfn "%s" feedUrl

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment