Skip to content

Instantly share code, notes, and snippets.

@jamessdixon

jamessdixon/WebCrawling

Last active Dec 19, 2019
Embed
What would you like to do?
WebCrawling With F#
#r @"C:\Users\DIXON2019\.nuget\packages\microsoft.azure.storage.blob\11.1.0\lib\netstandard2.0\Microsoft.Azure.Storage.Blob.dll"
#r @"C:\Users\DIXON2019\.nuget\packages\microsoft.azure.storage.common\11.1.0\lib\netstandard2.0\Microsoft.Azure.Storage.Common.dll"
#r @"C:\Users\DIXON2019\.nuget\packages\microsoft.azure.cosmos.table\1.0.5\lib\netstandard2.0\Microsoft.Azure.Cosmos.Table.dll"
#r @"C:\Users\DIXON2019\.nuget\packages\microsoft.azure.documentdb.core\2.1.3\lib\netstandard1.6\Microsoft.Azure.DocumentDB.Core.dll"
open System
open System.Net
open System.Windows.Forms
open System.Collections.Generic
open Microsoft.Azure.Storage
open Microsoft.Azure.Storage.Blob
open Microsoft.Azure.Cosmos.Table
/// Collects the `href` attribute of every anchor (`a` tag) in the document
/// whose outer text is exactly "View Report".
/// The anchor collection is looked up when the function is called; the
/// filtering and attribute reads happen lazily as the result is enumerated.
let getPdfUris (document:HtmlDocument) =
    let anchors = document.GetElementsByTagName("a")
    seq {
        for anchor in Seq.cast<HtmlElement> anchors do
            if anchor.OuterText = "View Report" then
                yield anchor.GetAttribute("href")
    }
/// Looks for the first anchor (`a` tag) in the document whose inner text is
/// exactly "Next". Returns `Some element` when one exists, otherwise `None`.
let getNextButton (document:HtmlDocument) =
    let isNextLink (anchor: HtmlElement) = anchor.InnerText = "Next"
    document.GetElementsByTagName("a")
    |> Seq.cast<HtmlElement>
    |> Seq.tryFind isNextLink
/// Invokes the "Click" member on the given element, discarding whatever the
/// call returns, and then logs that the button was invoked.
let invokeNextButton (nextButton: HtmlElement) =
    ignore (nextButton.InvokeMember("Click"))
    printfn "Next Button Invoked"
/// Processes the page currently loaded in `browser`:
/// appends every "View Report" link on it to `totalUris`, then clicks the
/// "Next" link when the page has one (does nothing further otherwise).
let handlePage (browser:WebBrowser) (totalUris:List<string>) =
    let page = browser.Document
    // AddRange enumerates the lazy link sequence here, before any click happens.
    totalUris.AddRange(getPdfUris page)
    getNextButton page |> Option.iter invokeNextButton
//let downloadPdf (uri:string) =
// let client = new WebClient();
// let targetFilePath = @"C:\Temp\" + Guid.NewGuid().ToString() + ".pdf";
// client.DownloadFile(uri,targetFilePath)
/// Table entity for one stored document. The partition key is the search
/// module name and the row key is the document id rendered with `ToString()`;
/// the supplied URI is exposed through the mutable `Uri` property.
type DocumentEntry(searchModule:string, documentId: Guid, uri: string) =
    inherit TableEntity(partitionKey = searchModule, rowKey = documentId.ToString())

    /// Parameterless constructor: null search module and URI, empty Guid.
    /// NOTE(review): presumably required by the table SDK to materialise entities — confirm.
    new() = DocumentEntry(null, Guid.Empty, null)

    /// URI value given at construction; readable and writable.
    member val Uri = uri with get, set
// Storage connection string parsed by both insertBlob and insertDocumentEntry.
// It is empty in this script.
// NOTE(review): presumably a real connection string must be supplied before running — confirm.
let connectionString = ""
/// Starts a copy of the resource at `uri` into the "documents" blob container,
/// naming the block blob "<documentId>.pdf". The value returned by `StartCopy`
/// is discarded.
let insertBlob (documentId: Guid) (uri:string) =
    let blobName = documentId.ToString() + ".pdf"
    // Fully qualified on purpose: the unqualified name is used with the table
    // client elsewhere in this file, so it appears to resolve to a different type.
    let account = Microsoft.Azure.Storage.CloudStorageAccount.Parse(connectionString)
    let container = account.CreateCloudBlobClient().GetContainerReference("documents")
    let target = container.GetBlockBlobReference(blobName)
    target.StartCopy(Uri(uri)) |> ignore
/// Inserts a `DocumentEntry` for the given id and URI into the "documents"
/// table, using "ConnecticutDepartmentOfInsurance" as the search module
/// (which becomes the partition key). The result of the insert is discarded.
let insertDocumentEntry (documentId: Guid) (uri:string) =
    let account = CloudStorageAccount.Parse(connectionString)
    let table = account.CreateCloudTableClient().GetTableReference("documents")
    let entry = DocumentEntry("ConnecticutDepartmentOfInsurance", documentId, uri)
    table.Execute(TableOperation.Insert(entry)) |> ignore
/// Stores one crawled document under a freshly generated id: first the blob
/// copy, then the table entry. Any exception raised along the way is caught
/// and its message printed, so one failing document does not stop the caller.
let handleDocument (uri:string) =
    try
        let newId = Guid.NewGuid()
        insertBlob newId uri
        insertDocumentEntry newId uri
    with ex ->
        printfn "FAILURE: %s" ex.Message
// Script entry point: crawl the listing page by page, collecting report links,
// then store every collected report.
let browser = new WebBrowser()
let uris = new List<string>()

// Navigate only starts the page load; DocumentCompleted fires later from the
// message loop. Without waiting, the download step below would run while
// `uris` is still empty. This flag is raised once a completed page has no
// "Next" link, i.e. there is nothing left to click through.
let crawlFinished = ref false
browser.DocumentCompleted.Add(fun _ ->
    // Decide before handlePage runs, because handlePage clicks the link.
    let isLastPage = Option.isNone (getNextButton browser.Document)
    handlePage browser uris
    if isLastPage then crawlFinished.Value <- true)

let uri = "https://www.catalog.state.ct.us/cid/portalApps/examinations.aspx"
browser.Navigate(uri)

// Pump window messages so the browser can load pages and raise its events.
// The timeout keeps a page that never completes from hanging the script.
let crawlTimeout = TimeSpan.FromMinutes(10.0)
let crawlClock = System.Diagnostics.Stopwatch.StartNew()
while not crawlFinished.Value && crawlClock.Elapsed < crawlTimeout do
    Application.DoEvents()
    // Short pause so the wait loop does not spin a CPU core.
    System.Threading.Thread.Sleep(50)
if not crawlFinished.Value then
    printfn "Crawl timed out after %O; continuing with %d links" crawlTimeout uris.Count

// printfn (not printf) so each status line ends with a newline, matching the
// other log lines in this script.
printfn "Links Done"
uris |> Seq.iter handleDocument
printfn "Downloads Done"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.