Web Crawling with F#
#r @"C:\Users\DIXON2019\.nuget\packages\microsoft.azure.storage.blob\11.1.0\lib\netstandard2.0\Microsoft.Azure.Storage.Blob.dll" | |
#r @"C:\Users\DIXON2019\.nuget\packages\microsoft.azure.storage.common\11.1.0\lib\netstandard2.0\Microsoft.Azure.Storage.Common.dll" | |
#r @"C:\Users\DIXON2019\.nuget\packages\microsoft.azure.cosmos.table\1.0.5\lib\netstandard2.0\Microsoft.Azure.Cosmos.Table.dll" | |
#r @"C:\Users\DIXON2019\.nuget\packages\microsoft.azure.documentdb.core\2.1.3\lib\netstandard1.6\Microsoft.Azure.DocumentDB.Core.dll" | |
open System | |
open System.Net | |
open System.Windows.Forms | |
open System.Collections.Generic | |
open Microsoft.Azure.Storage | |
open Microsoft.Azure.Storage.Blob | |
open Microsoft.Azure.Cosmos.Table | |
/// Yields the `href` attribute of every `<a>` element in `document` whose
/// OuterText is exactly "View Report".
/// The anchor collection is fetched when the function is called; the filtering
/// itself is lazy and only happens when the returned sequence is enumerated.
let getPdfUris (document:HtmlDocument) =
    let anchors = document.GetElementsByTagName("a") |> Seq.cast<HtmlElement>
    seq {
        for anchor in anchors do
            if anchor.OuterText = "View Report" then
                yield anchor.GetAttribute("href")
    }
/// Searches the `<a>` elements of `document` for the first one whose InnerText
/// is exactly "Next".
/// Returns `Some element` when one is found and `None` otherwise.
let getNextButton (document:HtmlDocument) =
    let isNextLink (anchor:HtmlElement) = anchor.InnerText = "Next"
    document.GetElementsByTagName("a")
    |> Seq.cast<HtmlElement>
    |> Seq.tryFind isNextLink
/// Invokes the "Click" member of `nextButton` and logs that it did so.
/// Whatever InvokeMember returns is discarded.
let invokeNextButton (nextButton: HtmlElement) =
    ignore (nextButton.InvokeMember("Click"))
    printfn "Next Button Invoked"
/// Harvests the currently loaded page of `browser`:
///   1. appends every "View Report" link found on it to `totalUris`;
///   2. if the page has a "Next" link, clicks it; otherwise does nothing more.
let handlePage (browser:WebBrowser) (totalUris:List<string>) =
    let page = browser.Document
    // AddRange enumerates the lazy sequence right away, so the links are read
    // from the page before the "Next" link is clicked below.
    totalUris.AddRange(getPdfUris page)
    getNextButton page |> Option.iter invokeNextButton
//let downloadPdf (uri:string) = | |
// let client = new WebClient(); | |
// let targetFilePath = @"C:\Temp\" + Guid.NewGuid().ToString() + ".pdf"; | |
// client.DownloadFile(uri,targetFilePath) | |
/// Table entity describing one stored document.
/// `searchModule` becomes the entity's partition key and the string form of
/// `documentId` becomes its row key; `uri` is exposed through the `Uri` property.
type DocumentEntry(searchModule:string, documentId: Guid, uri: string) =
    inherit TableEntity(partitionKey=searchModule, rowKey=documentId.ToString())

    /// Address the document was taken from (read/write auto-property,
    /// initialised from the constructor argument).
    member val Uri = uri with get, set

    /// Parameterless constructor: null keys/uri and an empty Guid.
    /// NOTE(review): presumably required so the table SDK can materialise
    /// entities on reads - confirm.
    new() = DocumentEntry(null, Guid.Empty, null)
/// Storage connection string that insertBlob and insertDocumentEntry hand to
/// CloudStorageAccount.Parse. It is blank in this script; a real value has to
/// be supplied here before either helper is used.
let connectionString : string = ""
/// Calls StartCopy on the block blob "<documentId>.pdf" in the "documents"
/// container, using `uri` as the copy source. The value returned by StartCopy
/// is discarded.
/// The storage account is resolved from `connectionString` on every call.
let insertBlob (documentId: Guid) (uri:string) =
    let blobName = documentId.ToString() + ".pdf"
    let account = Microsoft.Azure.Storage.CloudStorageAccount.Parse(connectionString)
    let blob =
        account.CreateCloudBlobClient().GetContainerReference("documents").GetBlockBlobReference(blobName)
    // The source Uri is built only after the blob reference exists, matching
    // the order in which the individual steps can fail.
    let source = new Uri(uri)
    blob.StartCopy(source) |> ignore
/// Inserts a DocumentEntry for (`documentId`, `uri`) into the "documents" table,
/// filed under the "ConnecticutDepartmentOfInsurance" search module.
/// The storage account is resolved from `connectionString` on every call and
/// the result of the table operation is discarded.
let insertDocumentEntry (documentId: Guid) (uri:string) =
    let table =
        CloudStorageAccount.Parse(connectionString).CreateCloudTableClient().GetTableReference("documents")
    let entry = DocumentEntry("ConnecticutDepartmentOfInsurance", documentId, uri)
    table.Execute(TableOperation.Insert(entry)) |> ignore
/// Stores one document: generates a fresh id, copies the file at `uri` into
/// blob storage under that id, then records a table entry with the same id.
/// Any exception raised along the way is caught and reported on stdout as
/// "FAILURE: <message>"; nothing is rethrown.
let handleDocument (uri:string) =
    try
        let newId = Guid.NewGuid()
        insertBlob newId uri
        insertDocumentEntry newId uri
    with
    | failure -> printfn "FAILURE: %s" failure.Message
// ---------------------------------------------------------------------------
// Script driver: page through the listing collecting report links, then
// store every collected document.
//
// WebBrowser.Navigate only *starts* loading the page; the links are gathered
// later, inside DocumentCompleted handlers. The download step therefore must
// not run straight after Navigate (the list would still be empty) - it is
// triggered from the handler once the last page has been harvested.
// ---------------------------------------------------------------------------

/// Browser control used to load and page through the listing.
let browser = new WebBrowser()

/// Report links accumulated across all visited pages.
let uris = new List<string>()

/// Address of the first listing page.
let uri = "https://www.catalog.state.ct.us/cid/portalApps/examinations.aspx"

/// Flipped to true when the download step starts, so that a repeated
/// DocumentCompleted notification for the last page cannot start it twice.
let downloadsStarted = ref false

/// Runs handleDocument over every collected link and reports progress.
let processCollectedUris () =
    printfn "Links Done"
    // Iterate a snapshot so the list cannot change underneath the loop.
    for link in uris.ToArray() do
        handleDocument link
    printfn "Downloads Done"

browser.DocumentCompleted.Add(fun _ ->
    // Decide whether this is the last page *before* handlePage clicks "Next":
    // a page without a "Next" link is where handlePage stops paging.
    let isLastPage = Option.isNone (getNextButton browser.Document)
    handlePage browser uris
    if isLastPage && not downloadsStarted.Value then
        downloadsStarted.Value <- true
        processCollectedUris ())

browser.Navigate(uri)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment