Stéfan Sinclair sgsinclair

## scrape-wwp.php
<?php

/**
 * A very simple script to scrape WWP content (note these are the HTML versions not the source XML).
 * This assumes that the site is open, as it is/was during March 2014.
 * Usage (from the command line): php scrape-wwp.php
 * Versions:
 *  1.0 (March 3, 2014): initial release
 * License: CC-BY (don't blame me if this reformats your drive)
 * Author: Stéfan Sinclair

## convertXml2Text.php
<?php

/**
 * A very simple script to convert files from XML to plain text.
 * This assumes that you're on a unix-like system that supports the find command.
 * Usage (from the command line): php convertXml2Text.php
 * Versions:
 *  1.0 (March 4, 2014): initial release
 * License: CC-BY (don't blame me if this reformats your drive)
 * Author: Stéfan Sinclair

## sshrcTsvToAnnualTitles.php
<?php

/**
 * A very simple script to parse SSHRC awards titles from Excel results into annual files.
 * It assumes you've downloaded results into Excel from here:
 * 		http://www.outil.ost.uqam.ca/CRSH/RechProj.aspx?vLangue=Anglais
 * and then saved the worksheet as tab separated values in a file called sshrc.txt
 * Usage (from the command line): php sshrcTsvToAnnualTitles.php
 * Versions:
 *  1.0 (April 8, 2014): initial release

## musicMetadata2text.php
<?php

// read in contents, don't use file() to have flexible handling of newline characters
$contents = file_get_contents(dirname(__FILE__) . "/tonyb-dedup-meta-for-cluster-transcript.csv");
$lines = preg_split("/(\r\n)|\r|\n/", $contents);
array_shift($lines);

// build text files
$lastStartTime = 0;
foreach($lines as $line) {

## orlando-rdf.js
var rdf = [{"about":"abdyma","label":"FEMALE","predicate":"hasSex"},{"about":"abdyma","label":"Maria","predicate":"hasGivenName"},{"about":"abdyma","label":"Smith","predicate":"hasSurname"},{"about":"abdyma","label":"Abdy","predicate":"hasMarriedName"},{"about":"abdyma","label":"M. A.","predicate":"hasPseudonym"},{"about":"abdyma","label":"Mira","predicate":"hasNickname"},{"about":"abdyma","label":"1797-02-25","predicate":"born"},{"about":"abdyma","label":"ONLY","predicate":"birthPosition"},{"about":"abdyma","label":"London","predicate":"birthSettlement"},{"about":"abdyma","label":"Middlesex","predicate":"birthRegion"},{"about":"abdyma","label":"English","predicate":"nationality"},{"about":"abdyma","label":"professional","predicate":"socialClass"},{"about":"abdyma","label":"PROFESSIONAL","predicate":"socialClass"},{"about":"abdyma","label":"Anglican Church","predicate":"hasDenomination"},{"about":"abdyma","label":"Dissenters","predicate":"hasDenomination"},{"about":"abdyma","label":"presumably white","predica

## holmes.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                sgsinclair
                / holmes.ipynb
            
            
              Last active
              August 29, 2015 14:17
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## test.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                sgsinclair
                / test.ipynb
            
            
              Created
              March 24, 2015 17:07
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## cablegate html to text
# *Simple* script by Stéfan Sinclair to extract text from Wikileaks Cablegate
#
# Usage:
#  ruby cablegateHtml2text [input_directory] [output_directory]
#
# If input directory and output directory are not specified, the current directory is used.

# process a directory recursively
def process_directory(input_directory, output_directory)
  Dir.new(input_directory).each do |file|

## bibsort.xsl
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema" version="2.0">
    <xsl:output method="html"/>
    <xsl:template match="/">
        <xsl:param name="sortField"/>
        <html xmlns="http://www.w3.org/1999/xhtml" lang="en">
            <head>
                <title>Bibliography</title>
            </head>

## OldBaileyDocuments.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="OldBaileyDocumentsCombine.xsl"?>
<documents>
<document href="ob/16740429.xml" />
<document href="ob/16740717.xml" />
<!-- and a couple of thousand more documents -->
</documents>
	<?php

	/**
	* A very simple script to scrape WWP content (note these are the HTML versions not the source XML).
	* This assumes that the site is open, as it is/was during March 2014.
	* Usage (from the command line): php scrape-wwp.php
	* Versions:
	* 1.0 (March 3, 2014): initial release
	* License: CC-BY (don't blame me if this reformats your drive)
	* Author: Stéfan Sinclair
	<?php

	/**
	* A very simple script to convert files from XML to plain text.
	* This assumes that you're on a unix-like system that supports the find command.
	* Usage (from the command line): php convertXml2Text.php
	* Versions:
	* 1.0 (March 4, 2014): initial release
	* License: CC-BY (don't blame me if this reformats your drive)
	* Author: Stéfan Sinclair
	<?php

	/**
	* A very simple script to parse SSHRC awards titles from Excel results into annual files.
	* It assumes you've downloaded results into Excel from here:
	* http://www.outil.ost.uqam.ca/CRSH/RechProj.aspx?vLangue=Anglais
	* and then saved the worksheet as tab separated values in a file called sshrc.txt
	* Usage (from the command line): php sshrcTsvToAnnualTitles.php
	* Versions:
	* 1.0 (April 8, 2014): initial release
	<?php

	// read in contents, don't use file() to have flexible handling of newline characters
	$contents = file_get_contents(dirname(__FILE__) . "/tonyb-dedup-meta-for-cluster-transcript.csv");
	$lines = preg_split("/(\r\n)|\r|\n/", $contents);
	array_shift($lines);

	// build text files
	$lastStartTime = 0;
	foreach($lines as $line) {
	# Simple script by Stéfan Sinclair to extract text from Wikileaks Cablegate
	#
	# Usage:
	# ruby cablegateHtml2text [input_directory] [output_directory]
	#
	# If input directory and output directory are not specified, the current directory is used.

	# process a directory recursively
	def process_directory(input_directory, output_directory)
	Dir.new(input_directory).each do |file|
	<?xml version="1.0" encoding="UTF-8"?>
	<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
	xmlns:xs="http://www.w3.org/2001/XMLSchema" version="2.0">
	<xsl:output method="html"/>
	<xsl:template match="/">
	<xsl:param name="sortField"/>
	<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
	<head>
	<title>Bibliography</title>
	</head>
	<?xml version="1.0"?>
	<?xml-stylesheet type="text/xsl" href="OldBaileyDocumentsCombine.xsl"?>
	<documents>
	<document href="ob/16740429.xml" />
	<document href="ob/16740717.xml" />
	<!-- and a couple of thousand more documents -->
	</documents>