Skip to content

Instantly share code, notes, and snippets.

@jamessdixon

jamessdixon/WebCrawling

Last active Dec 19, 2019
Embed
What would you like to do?
WebCrawling With F#
#r @"C:\Users\DIXON2019\.nuget\packages\microsoft.azure.storage.blob\11.1.0\lib\netstandard2.0\Microsoft.Azure.Storage.Blob.dll"
#r @"C:\Users\DIXON2019\.nuget\packages\microsoft.azure.storage.common\11.1.0\lib\netstandard2.0\Microsoft.Azure.Storage.Common.dll"
#r @"C:\Users\DIXON2019\.nuget\packages\microsoft.azure.cosmos.table\1.0.5\lib\netstandard2.0\Microsoft.Azure.Cosmos.Table.dll"
#r @"C:\Users\DIXON2019\.nuget\packages\microsoft.azure.documentdb.core\2.1.3\lib\netstandard1.6\Microsoft.Azure.DocumentDB.Core.dll"
open System
open System.Net
open System.Windows.Forms
open System.Collections.Generic
open Microsoft.Azure.Storage
open Microsoft.Azure.Storage.Blob
open Microsoft.Azure.Cosmos.Table
/// Yields the "href" attribute of every anchor element in `document`
/// whose outer text is exactly "View Report".
/// The anchors are looked up immediately; filtering happens lazily as the
/// returned sequence is enumerated.
let getPdfUris (document:HtmlDocument) =
    let anchors = document.GetElementsByTagName("a") |> Seq.cast<HtmlElement>
    seq {
        for anchor in anchors do
            if anchor.OuterText = "View Report" then
                yield anchor.GetAttribute("href")
    }
/// Returns the first anchor element in `document` whose inner text is exactly
/// "Next", or None when the page has no such anchor.
let getNextButton (document:HtmlDocument) =
    let isNextLink (anchor: HtmlElement) = anchor.InnerText = "Next"
    document.GetElementsByTagName("a")
    |> Seq.cast<HtmlElement>
    |> Seq.tryFind isNextLink
/// Invokes the "Click" member on `nextButton`, discards whatever the call
/// returns, and logs that the click was issued.
let invokeNextButton (nextButton: HtmlElement) =
    ignore (nextButton.InvokeMember("Click"))
    printfn "Next Button Invoked"
/// Processes the page currently loaded in `browser`:
/// appends every report link found on it to `totalUris`, then clicks the
/// "Next" link if the page has one (does nothing otherwise).
let handlePage (browser:WebBrowser) (totalUris:List<string>) =
    let page = browser.Document
    totalUris.AddRange(getPdfUris page)
    // Option.iter runs the click only for Some; None falls through silently.
    getNextButton page |> Option.iter invokeNextButton
//let downloadPdf (uri:string) =
// let client = new WebClient();
// let targetFilePath = @"C:\Temp\" + Guid.NewGuid().ToString() + ".pdf";
// client.DownloadFile(uri,targetFilePath)
/// Table entity describing one harvested document.
/// The partition key is `searchModule` and the row key is `documentId`
/// rendered with ToString().
type DocumentEntry(searchModule: string, documentId: Guid, uri: string) =
    inherit TableEntity(searchModule, documentId.ToString())

    /// Address the document was found at; readable and writable.
    member val Uri = uri with get, set

    /// Parameterless constructor: null search module, empty GUID, null URI.
    /// NOTE(review): presumably required so the table SDK can materialise
    /// entities — confirm.
    new() = DocumentEntry(null, Guid.Empty, null)
// Storage connection string parsed by both insertBlob and insertDocumentEntry.
// It is empty here; supply a real value before running the script
// (presumably CloudStorageAccount.Parse rejects an empty string — confirm).
let connectionString = ""
/// Starts a copy of the resource at `uri` into the block blob
/// "<documentId>.pdf" inside the "documents" container of the account
/// described by `connectionString`. The value returned by StartCopy is discarded.
/// NOTE(review): StartCopy presumably only schedules the copy and does not
/// wait for it to finish — confirm against the SDK docs.
let insertBlob (documentId: Guid) (uri:string) =
    let blobName = documentId.ToString() + ".pdf"
    // Fully qualified because a CloudStorageAccount type is also used for tables elsewhere in this file.
    let account = Microsoft.Azure.Storage.CloudStorageAccount.Parse(connectionString)
    let container = account.CreateCloudBlobClient().GetContainerReference("documents")
    let blob = container.GetBlockBlobReference(blobName)
    ignore (blob.StartCopy(Uri(uri)))
/// Inserts a DocumentEntry for (`documentId`, `uri`) into the "documents"
/// table, under the partition "ConnecticutDepartmentOfInsurance".
/// The result of the table operation is discarded.
let insertDocumentEntry (documentId: Guid) (uri:string) =
    let account = CloudStorageAccount.Parse(connectionString)
    let table = account.CreateCloudTableClient().GetTableReference("documents")
    let entry = DocumentEntry("ConnecticutDepartmentOfInsurance", documentId, uri)
    ignore (table.Execute(TableOperation.Insert(entry)))
/// Archives one document: generates a fresh id, copies the PDF at `uri` into
/// blob storage, then records a table entry under the same id.
/// Any exception from either step is caught and its message printed, so one
/// failing document does not abort the caller's loop.
let handleDocument (uri:string) =
    try
        let id = Guid.NewGuid()
        insertBlob id uri
        insertDocumentEntry id uri
    with
    | ex -> printfn "FAILURE: %s" ex.Message
// Script entry point: crawl the paged examination listing, collecting every
// "View Report" link, then archive each linked PDF via handleDocument.
let browser = new WebBrowser()
let uris = new List<string>()
// Set by the page handler once a loaded page has no "Next" link,
// i.e. the last page of results has been harvested.
let crawlFinished = ref false
// Restarted on every loaded page, so the timeout below measures idle time
// between pages rather than the total duration of the crawl.
let idleTimer = System.Diagnostics.Stopwatch.StartNew()
let idleTimeout = TimeSpan.FromSeconds(60.0)
browser.DocumentCompleted.Add(fun _ ->
    idleTimer.Restart()
    // Inspect the page before handlePage clicks "Next", so the check and the
    // harvest both look at the same document.
    let isLastPage = Option.isNone (getNextButton browser.Document)
    handlePage browser uris
    if isLastPage then crawlFinished.Value <- true)
let uri = "https://www.catalog.state.ct.us/cid/portalApps/examinations.aspx"
browser.Navigate(uri)
// Navigate only starts the load: DocumentCompleted is raised later, from the
// message loop. Without pumping messages here the script would fall straight
// through and iterate an empty `uris` list.
// NOTE(review): assumes the script runs on an STA thread (as F# Interactive's
// main thread is) — confirm for other hosts.
while not crawlFinished.Value && idleTimer.Elapsed < idleTimeout do
    Application.DoEvents()
    System.Threading.Thread.Sleep(50)
if not crawlFinished.Value then
    printfn "WARNING: crawl timed out waiting for the next page"
printfn "Links Done"
uris |> Seq.iter(fun uri -> handleDocument uri)
printfn "Downloads Done"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment