Web Crawling with F#
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#r @"C:\Users\DIXON2019\.nuget\packages\microsoft.azure.storage.blob\11.1.0\lib\netstandard2.0\Microsoft.Azure.Storage.Blob.dll" | |
#r @"C:\Users\DIXON2019\.nuget\packages\microsoft.azure.storage.common\11.1.0\lib\netstandard2.0\Microsoft.Azure.Storage.Common.dll" | |
#r @"C:\Users\DIXON2019\.nuget\packages\microsoft.azure.cosmos.table\1.0.5\lib\netstandard2.0\Microsoft.Azure.Cosmos.Table.dll" | |
#r @"C:\Users\DIXON2019\.nuget\packages\microsoft.azure.documentdb.core\2.1.3\lib\netstandard1.6\Microsoft.Azure.DocumentDB.Core.dll" | |
open System | |
open System.Net | |
open System.Windows.Forms | |
open System.Collections.Generic | |
open Microsoft.Azure.Storage | |
open Microsoft.Azure.Storage.Blob | |
open Microsoft.Azure.Cosmos.Table | |
/// Returns the `href` attribute of every `<a>` element in `document`
/// whose outer text is exactly "View Report".
///
/// The anchor collection is fetched immediately; the filtering and the
/// attribute reads happen lazily, each time the returned sequence is enumerated.
let getPdfUris (document:HtmlDocument) =
    let anchors = document.GetElementsByTagName("a")
    seq {
        for anchor in Seq.cast<HtmlElement> anchors do
            if anchor.OuterText = "View Report" then
                yield anchor.GetAttribute("href")
    }
/// Looks for the first `<a>` element in `document` whose inner text is
/// exactly "Next".
///
/// Returns `Some element` for the first match, or `None` when no anchor matches.
let getNextButton (document:HtmlDocument) =
    let isNextLink (anchor: HtmlElement) = anchor.InnerText = "Next"
    document.GetElementsByTagName("a")
    |> Seq.cast<HtmlElement>
    |> Seq.filter isNextLink
    |> Seq.tryHead
/// Invokes the "Click" member on `nextButton` and logs that it did so.
/// The value returned by `InvokeMember` is discarded.
let invokeNextButton (nextButton: HtmlElement) =
    ignore (nextButton.InvokeMember("Click"))
    printfn "Next Button Invoked"
/// Processes the document currently loaded in `browser`:
///   1. appends every "View Report" link found on it to `totalUris`;
///   2. if the page has a "Next" link, clicks it.
///
/// Does nothing further when there is no "Next" link.
let handlePage (browser:WebBrowser) (totalUris:List<string>) =
    let page = browser.Document
    // AddRange enumerates the lazy sequence, so the links are read before the click below.
    totalUris.AddRange(getPdfUris page)
    getNextButton page |> Option.iter invokeNextButton
//let downloadPdf (uri:string) = | |
// let client = new WebClient(); | |
// let targetFilePath = @"C:\Temp\" + Guid.NewGuid().ToString() + ".pdf"; | |
// client.DownloadFile(uri,targetFilePath) | |
/// Table entity describing one document.
///
/// The partition key is `searchModule` and the row key is `documentId`
/// rendered with `Guid.ToString()`.
type DocumentEntry(searchModule:string, documentId: Guid, uri: String) =
    inherit TableEntity(searchModule, documentId.ToString())

    /// Parameterless constructor; builds an entry with a null partition key,
    /// the empty Guid as row key and a null Uri.
    /// NOTE(review): presumably needed by the table SDK when it materialises entities — confirm.
    new() = DocumentEntry(null, Guid.Empty, null)

    /// Mutable property initialised from the `uri` constructor argument.
    member val Uri = uri with get, set
/// Storage connection string passed to `CloudStorageAccount.Parse` by
/// `insertBlob` and `insertDocumentEntry`. It is empty here and must be
/// filled in before running the script.
let connectionString : string = ""
/// Starts a copy of the resource at `uri` into the block blob
/// "<documentId>.pdf" inside the "documents" container of the storage
/// account described by `connectionString`.
///
/// The value returned by `StartCopy` is discarded.
let insertBlob (documentId: Guid) (uri:string) =
    let blobName = sprintf "%s.pdf" (documentId.ToString())
    let targetBlob =
        // Fully qualified: `CloudStorageAccount` also exists in Microsoft.Azure.Cosmos.Table,
        // which is opened later and would otherwise shadow the blob-storage type.
        Microsoft.Azure.Storage.CloudStorageAccount.Parse(connectionString)
            .CreateCloudBlobClient()
            .GetContainerReference("documents")
            .GetBlockBlobReference(blobName)
    targetBlob.StartCopy(Uri(uri)) |> ignore
/// Inserts a `DocumentEntry` row for `documentId` / `uri` into the
/// "documents" table of the storage account described by `connectionString`.
///
/// The partition key is always "ConnecticutDepartmentOfInsurance".
/// The result of the table operation is discarded.
let insertDocumentEntry (documentId: Guid) (uri:string) =
    let table =
        CloudStorageAccount.Parse(connectionString)
            .CreateCloudTableClient()
            .GetTableReference("documents")
    let entry = DocumentEntry("ConnecticutDepartmentOfInsurance", documentId, uri)
    table.Execute(TableOperation.Insert(entry)) |> ignore
/// Stores one document: generates a fresh Guid, copies the PDF at `uri`
/// into blob storage under that id, then records a matching table row.
///
/// Any exception raised by either step is caught and reported on stdout
/// as "FAILURE: <message>"; nothing is re-thrown.
let handleDocument (uri:string) =
    try
        let freshId = Guid.NewGuid()
        insertBlob freshId uri
        insertDocumentEntry freshId uri
    with failure ->
        printfn "FAILURE: %s" failure.Message
// ---------------------------------------------------------------------------
// Script entry point: crawl the examinations portal, collecting report links
// page by page, then store every collected document.
// ---------------------------------------------------------------------------

/// Browser used to load and page through the portal.
let browser = new WebBrowser()

/// Accumulates every report link found across all visited pages.
let uris = new List<string>()

// Each completed page load harvests its links and clicks "Next" when present.
browser.DocumentCompleted.Add(fun _ -> handlePage browser uris)

/// Start page of the crawl.
let uri = "https://www.catalog.state.ct.us/cid/portalApps/examinations.aspx"
browser.Navigate(uri)

// NOTE(review): Navigate appears to return before DocumentCompleted fires, so when
// this script is run in one go `uris` may still be empty at this point. It looks
// like the lines below are meant to be evaluated interactively once paging has
// finished — confirm, or add an explicit completion signal.
printfn "Links Done"

uris |> Seq.iter handleDocument

// printfn (not printf) so the status lines do not run into each other or into
// any "FAILURE: ..." output emitted by handleDocument.
printfn "Downloads Done"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment