Stéfan Sinclair sgsinclair

## cablegate html to text
# *Simple* script by Stéfan Sinclair to extract text from Wikileaks Cablegate
#
# Usage:
#  ruby cablegateHtml2text [input_directory] [output_directory]
#
# If input directory and output directory are not specified, the current directory is used.

# process a directory recursively
def process_directory(input_directory, output_directory)
  Dir.new(input_directory).each do |file|

## bibsort.xsl
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema" version="2.0">
    <xsl:output method="html"/>
    <xsl:template match="/">
        <xsl:param name="sortField"/>
        <html xmlns="http://www.w3.org/1999/xhtml" lang="en">
            <head>
                <title>Bibliography</title>
            </head>

## OldBaileyDocuments.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="OldBaileyDocumentsCombine.xsl"?>
<documents>
<document href="ob/16740429.xml" />
<document href="ob/16740717.xml" />
<!-- and a couple of thousand more documents -->
</documents>

## gist:3785028
<div class="post-107 [...]">
  <h2 class="entry-title">Interview with Dr. Timothy Powell</h2>
  <div class="post-info">
    <span class="date published time" title="2012-09-18T16:06:08+00:00">September 18, 2012</span>
    By <span class="author vcard">Elizabeth Couchum</span>
  </div>
  <div class="entry-content">
    <p>Blog of Dr. Timothy Powell [...]</p>
  </div>
</div>

## gist:3785235
<item>
  <title>Interview with Dr. Timothy Powell</title>
  <pubDate>Tue, 18 Sep 2012 16:06:08 +0000</pubDate>
  <dc:creator>Elizabeth Couchum</dc:creator>
  <content:encoded><![CDATA[<p>Blog of Dr. Timothy Powell [...]</p> ]]></content:encoded>
</item>

## gist:3785350
<?php

/*** BEGIN a set of parameters that can easily be set ***/

// the base URL (without trailing slash)
$url = 'http://digitally.doinghistory.com/category/digital-humanist-interview';

// the regex that defines the contents of the post – the entire expression is kept
$post_regex = "/<div class=\"post-(\d+)\s.+?<!-- end \.entry-content -->/s";

## gist:3785665
{
    "status": "OK",
    "usage": "By accessing AlchemyAPI [...]",
    "url": "",
    "language": "english",
    "docSentiment": {
        "type": "negative",
        "score": "-0.248111"
    }
}

## gist:3785704
<?php

/*** BEGIN a set of parameters that can easily be set ***/

$apikey = ''; // *** YOU NEED TO SET THIS TO SOMETHING VALID ***/

// input directory (where source files are located)
$input_directory = dirname(__FILE__) . '/harvested_html';

/*** END a set of parameters that can easily be set ***/

## gist:3788724
<?php

/*** BEGIN a set of parameters that can easily be set ***/

// input directory (where source files are located)
$input_directory = dirname(__FILE__) . '/harvested_html';

// we'll define this manually so that we keep the proper ordering
$files = array(
	"Digital Humanism in the Last Frontier 9696.html",

## gist:3789252
Digital Humanism in the Last Frontier 9696.html: people (12), digital (8), humanism (6), humanist (5), nbsp (5), alaska (5), lot (5), native (4), education (4), challenges (4), corporate (4), health (4), andrew (4), traditional (4), time (3), biggest (3), working (3), specifically (3), personal (3), rural (3), beliefs (3), responsibilities (2), political (2), channels (2), set (2), sort (2), water (2), view (2), hard (2), perfectly (2), compromise (2), environment (2), support (2), frontier (2), involved (2), official (2), day (2), living (2), creativity (2), government (2), conciliatory (2), humans (2), culture (2), tribal (2), work (2), worked (2), anymore (2), recently (2), consortium (2), communications (2), food (2), end (2), good (2), dialogue (2), liaison (2), lack (2), difficult (1), top (1), sorts (1), spring (1), profit (1), survival (1), treatment (1), facilities (1), building (1), communities- (1), deleterious (1), separate (1), face (1), stands (1), mine (1), gold (1), threaten (1), vast (1), riv
	# Simple script by Stéfan Sinclair to extract text from Wikileaks Cablegate
	#
	# Usage:
	# ruby cablegateHtml2text [input_directory] [output_directory]
	#
	# If input directory and output directory are not specified, the current directory is used.

	# process a directory recursively
	def process_directory(input_directory, output_directory)
	Dir.new(input_directory).each do |file|
	<?xml version="1.0" encoding="UTF-8"?>
	<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
	xmlns:xs="http://www.w3.org/2001/XMLSchema" version="2.0">
	<xsl:output method="html"/>
	<xsl:template match="/">
	<xsl:param name="sortField"/>
	<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
	<head>
	<title>Bibliography</title>
	</head>
	<?xml version="1.0"?>
	<?xml-stylesheet type="text/xsl" href="OldBaileyDocumentsCombine.xsl"?>
	<documents>
	<document href="ob/16740429.xml" />
	<document href="ob/16740717.xml" />
	<!-- and a couple of thousand more documents -->
	</documents>
	<div class="post-107 [...]">
	<h2 class="entry-title">Interview with Dr. Timothy Powell</h2>
	<div class="post-info">
	<span class="date published time" title="2012-09-18T16:06:08+00:00">September 18, 2012</span>
	By <span class="author vcard">Elizabeth Couchum</span>
	</div>
	<div class="entry-content">
	<p>Blog of Dr. Timothy Powell [...]</p>
	</div>
	</div>
	<item>
	<title>Interview with Dr. Timothy Powell</title>
	<pubDate>Tue, 18 Sep 2012 16:06:08 +0000</pubDate>
	<dc:creator>Elizabeth Couchum</dc:creator>
	<content:encoded><![CDATA[<p>Blog of Dr. Timothy Powell [...]</p> ]]></content:encoded>
	</item>
	<?php

	/* BEGIN a set of parameters that can easily be set */

	// the base URL (without trailing slash)
	$url = 'http://digitally.doinghistory.com/category/digital-humanist-interview';

	// the regex that defines the contents of the post – the entire expression is kept
	$post_regex = "/<div class=\"post-(\d+)\s.+?<!-- end \.entry-content -->/s";
	{
	"status": "OK",
	"usage": "By accessing AlchemyAPI [...]",
	"url": "",
	"language": "english",
	"docSentiment": {
	"type": "negative",
	"score": "-0.248111"
	}
	}
	<?php

	/* BEGIN a set of parameters that can easily be set */

	$apikey = ''; // * YOU NEED TO SET THIS TO SOMETHING VALID */

	// input directory (where source files are located)
	$input_directory = dirname(__FILE__) . '/harvested_html';

	/* END a set of parameters that can easily be set */