Skip to content

Instantly share code, notes, and snippets.

@sgsinclair
Created March 18, 2011 17:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sgsinclair/876514 to your computer and use it in GitHub Desktop.
Save sgsinclair/876514 to your computer and use it in GitHub Desktop.
<?php
/**
 * Splits the combined export in $file into one XML file per document.
 *
 * The input is read line by line. The text found between a
 * `<document id="...">` tag and the next `</document>` tag is collected and
 * stored as "$output/<id>.xml". Text outside of any document is ignored.
 * Documents are assumed not to be nested.
 */
$file = 'ob/temp/OldBaileyDocuments.html';
$output = 'ob/data';

$handle = fopen($file, 'r');
if ($handle === false) {
    die("unable to open: $file");
}

$id = '';       // id of the document being collected ('' means "not inside a document")
$content = '';  // text collected so far for that document

while (($buffer = fgets($handle)) !== false) {
    // One line may hold several tags (an end tag followed by a start tag, or
    // a whole document), so the line is consumed piece by piece.
    $rest = $buffer;
    while ($rest !== '') {
        if ($id === '') {
            // Outside a document: only a start tag is of interest.
            if (strpos($rest, '<document id') === false) {
                break;
            }
            if (!preg_match("/<document id=\"(.+?)\">(.*)/", $rest, $match)) {
                die("match not found: $buffer");
            }
            $id = $match[1];
            $rest = $match[2];
            continue;
        }

        // Inside a document: collect text up to the closing tag.
        $end = strpos($rest, '</document>');
        if ($end === false) {
            $content .= $rest;
            break;
        }
        $content .= substr($rest, 0, $end);
        writefile($id, $content, $output);

        // Reset the state so that text between documents is not appended to
        // the file that was just written.
        $id = '';
        $content = '';
        $rest = (string) substr($rest, $end + strlen('</document>'));
    }
}
if (!feof($handle)) {
    echo "Error: unexpected fgets() fail\n";
}
fclose($handle);

// Flush a document whose closing tag never appeared so its text is not lost.
if ($id !== '') {
    echo "Warning: document $id has no closing tag\n";
    writefile($id, $content, $output);
}

/**
 * Writes one document to "$dir/$id.xml", creating $dir when it is missing.
 *
 * The id is echoed as a progress indicator. Nothing is written when $id or
 * $content is empty. The script stops when the directory or the file cannot
 * be written.
 *
 * @param string $id      Document id; used as the file name.
 * @param string $content Document body to store.
 * @param string $dir     Target directory; defaults to 'data'.
 */
function writefile($id, $content, $dir = 'data') {
    echo $id;
    if ((string) $id === '' || (string) $content === '') {
        return;
    }
    // The second is_dir() covers a directory created concurrently.
    if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
        die("unable to create directory: $dir");
    }
    if (file_put_contents("$dir/$id.xml", $content) === false) {
        die("unable to write: $dir/$id.xml");
    }
}
?>
<?php
/**
 * Copies every document found in $sourcedir into per-year and per-decade
 * folders below $targetdir: "year/<YYYY>/" and "decade/<YYY>0s/".
 */
$sourcedir = 'ob/data';
$targetdir = 'ob/data';
$yeartarget = $targetdir . '/year';
$decadetarget = $targetdir . '/decade';
checkdir($yeartarget);
checkdir($decadetarget);

// glob() returns false on error; treat that like "no files".
$files = glob("$sourcedir/*xml");
if ($files === false) {
    $files = array();
}

foreach ($files as $file) {
    $f = basename($file);
    // The date is read starting at the second character of the file name.
    // NOTE(review): presumably names look like "t16740429-1.xml" (a one-letter
    // prefix followed by YYYYMMDD) - confirm against the real data.
    $year = substr($f, 1, 4);
    $decade = substr($f, 1, 3) . '0s';

    $targets = array(
        $yeartarget . '/' . $year,
        $decadetarget . '/' . $decade,
    );
    foreach ($targets as $dir) {
        checkdir($dir);
        if (!copy($file, $dir . '/' . $f)) {
            echo "Error: could not copy $file to $dir\n";
        }
    }
}
/**
 * Makes sure that $dir exists as a directory, creating it (and any missing
 * parent directories) when necessary.
 *
 * A warning is raised when the path exists but is not a directory; mkdir()
 * raises its own warning when the directory cannot be created.
 *
 * @param string $dir Path of the directory to ensure.
 */
function checkdir($dir) {
    if (is_dir($dir)) {
        return;
    }
    if (file_exists($dir)) {
        trigger_error("checkdir: $dir exists but is not a directory", E_USER_WARNING);
        return;
    }
    // Recursive, so a missing parent directory does not make the call fail.
    mkdir($dir, 0777, true);
}
?>
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="OldBaileyDocumentsCombine.xsl"?>
<!-- Manifest of source documents: each <document> entry names one file through its href attribute. -->
<!-- NOTE(review): presumably the stylesheet referenced above loads every listed href - confirm against that stylesheet. -->
<documents>
<document href="ob/16740429.xml" />
<document href="ob/16740717.xml" />
<!-- and a couple of thousand more documents -->
</documents>
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Combines the trial accounts of many source files into one XML document.

  Input:  a <documents> element whose <document href="..."/> children name
          the files to load.
  Output: a <combined-documents> element holding one <document id="...">
          wrapper for every element that has type="trialAccount".
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    exclude-result-prefixes="xs" version="2.0">
  <!-- exclude-result-prefixes keeps the unused xs namespace declaration out of the output -->
  <xsl:output method="xml"/>

  <!-- match the root of the input (the <documents> list) -->
  <xsl:template match="/">
    <!-- wrap all documents in a new root tag -->
    <combined-documents>
      <!-- load every file named by an href attribute and process it in "document" mode -->
      <xsl:apply-templates select="document(/documents/document/@href)" mode="document"/>
    </combined-documents>
  </xsl:template>

  <!-- this will match the root node of each loaded document -->
  <xsl:template match="/" mode="document">
    <!-- this is the XPath we're looking for: any element with type="trialAccount" -->
    <xsl:apply-templates select="//*[@type='trialAccount']"/>
  </xsl:template>

  <xsl:template match="*[@type='trialAccount']">
    <!-- wrap each trial account, tagged with its own id, so that we can easily find it afterwards -->
    <document id="{@id}">
      <xsl:copy-of select="."/>
    </document>
  </xsl:template>
</xsl:stylesheet>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment