Skip to content

Instantly share code, notes, and snippets.

@pldmgg
Created August 30, 2017 11:02
Show Gist options
  • Save pldmgg/c118cdbf9226a94faad11fd1583ba6cb to your computer and use it in GitHub Desktop.
Save pldmgg/c118cdbf9226a94faad11fd1583ba6cb to your computer and use it in GitHub Desktop.
OpenScrapingTest_For_scriptcs_cli.csx
#r "C:\Windows\Microsoft.NET\Framework64\v4.0.30319\mscorlib.dll"
#r "System"
#r "System.Core"
#r ".\scriptcs_packages\System.Runtime.4.3.0\lib\net462\System.Runtime.dll"
#r ".\scriptcs_packages\System.Runtime.Extensions.4.3.0\lib\net462\System.Runtime.Extensions.dll"
#r "System.Collections"
#r ".\scriptcs_packages\Newtonsoft.Json.10.0.3\lib\netstandard1.3\Newtonsoft.Json.dll"
#r ".\scriptcs_packages\OpenScraping.1.0.1\lib\netcoreapp2.0\OpenScraping.dll"
#r ".\scriptcs_packages\HtmlAgilityPack.1.5.1\lib\netstandard1.6\HtmlAgilityPack.dll"
using System;
using Newtonsoft.Json;
using OpenScraping;
using OpenScraping.Config;
// Smoke test for the OpenScraping package under scriptcs: build an extraction
// config, run it against a small in-memory HTML document, and print the result.

// Extraction rules as JSON: each property names an output field and its value
// is the XPath expression evaluated against the HTML.
// This is a verbatim string, so \' is passed through unchanged and reaches the
// JSON parser as an escaped single quote inside a single-quoted JSON string.
var configJson = @"
{
'title': '//h1',
'body': '//div[contains(@class, \'article\')]'
}
";
var config = StructuredDataConfig.ParseJsonString(configJson);

// Sample document containing one match for each rule above.
var html = "<html><body><h1>Article title</h1><div class='article'>Article contents</div></body></html>";

var openScraping = new StructuredDataExtractor(config);
var scrapingResults = openScraping.Extract(html);

// Print a single extracted field, a separator, then the whole result as indented JSON.
Console.WriteLine(scrapingResults["title"]);
Console.WriteLine("----------------------------");
Console.WriteLine(JsonConvert.SerializeObject(scrapingResults, Formatting.Indented));

// Pause so the output stays visible when the script is launched in its own
// console window. Console.ReadKey throws InvalidOperationException when
// standard input is redirected (piped input, CI, spawned by another process),
// so only wait for a key when a real console is attached.
if (!Console.IsInputRedirected)
{
    Console.ReadKey();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment