Created
March 18, 2011 17:42
-
-
Save sgsinclair/876514 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Splits the combined document export into one XML file per <document>.
 *
 * Reads $file line by line, collects the text found between each
 * <document id="..."> opening tag and the following </document>, and writes
 * that text to "$output/<id>.xml".
 */
$file = 'ob/temp/OldBaileyDocuments.html';
$output = 'ob/data';

$handle = fopen($file, 'r');
if ($handle === false) {
    die("unable to open: $file");
}

$openTag = '/<document id="(.+?)">/';
$closeTag = '</document>';

$id = '';       // id of the document being collected; '' while between documents
$content = '';  // text collected so far for the current document

while (($buffer = fgets($handle)) !== false) {
    // One physical line can close a document and open the next one (or hold a
    // whole short document), so keep consuming the line until nothing
    // actionable is left on it.
    $rest = $buffer;
    while ($rest !== '') {
        if ($id === '') {
            if (strpos($rest, '<document id') === false) {
                break; // text between documents is ignored
            }
            if (!preg_match($openTag, $rest, $match, PREG_OFFSET_CAPTURE)) {
                die("match not found: $buffer");
            }
            $id = $match[1][0];
            $content = '';
            // Continue right after the opening tag; the cast covers substr()
            // returning false when the tag ends the line.
            $rest = (string) substr($rest, $match[0][1] + strlen($match[0][0]));
        } else {
            $end = strpos($rest, $closeTag);
            if ($end === false) {
                // Still inside the document: keep the line, newline included.
                $content .= $rest;
                break;
            }
            $content .= substr($rest, 0, $end);
            writefile($id, $content, $output);
            // Reset so that text following the closing tag is not appended to
            // (and rewritten into) the file that was just completed.
            $id = '';
            $content = '';
            $rest = (string) substr($rest, $end + strlen($closeTag));
        }
    }
}

if (!feof($handle)) {
    echo "Error: unexpected fgets() fail\n";
}
fclose($handle);

if ($id !== '') {
    // Input ended inside a document: keep what was collected rather than drop it.
    writefile($id, $content, $output);
}

/**
 * Writes one document to "$dir/$id.xml" and echoes its id as progress output.
 *
 * Nothing is written when $id or $content is empty.
 *
 * @param string $id      document id, used as the file name (without extension)
 * @param string $content text to store in the file
 * @param string $dir     target directory; defaults to the previously hard-coded 'data'
 */
function writefile($id, $content, $dir = 'data') {
    echo $id;
    if ($id && $content) {
        if (!is_dir($dir)) {
            mkdir($dir, 0777, true);
        }
        if (file_put_contents("$dir/$id.xml", $content) === false) {
            echo "Error: could not write $dir/$id.xml\n";
        }
    }
}
?>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Copies every file matched by "$sourcedir/*xml" into a per-year folder under
 * "$targetdir/year" and a per-decade folder under "$targetdir/decade".
 * The source files are left in place.
 */
$sourcedir = 'ob/data';
$targetdir = 'ob/data';
$yeartarget = $targetdir . '/year';
$decadetarget = $targetdir . '/decade';
checkdir($yeartarget);
checkdir($decadetarget);

// glob() returns false on error; iterate over nothing instead of handing
// false to foreach.
$files = glob("$sourcedir/*xml");
if ($files === false) {
    $files = array();
}

foreach ($files as $file) {
    $f = basename($file);

    // Characters at offsets 1-4 of the file name are used as the year
    // (presumably skipping a one-character id prefix -- confirm against real ids).
    $year = substr($f, 1, 4);
    $y = $yeartarget . '/' . $year;
    checkdir($y);
    copy($file, $y . '/' . $f);

    // The first three of those characters plus "0s" name the decade folder.
    $decade = substr($f, 1, 3) . '0s';
    $d = $decadetarget . '/' . $decade;
    checkdir($d);
    copy($file, $d . '/' . $f);
}
/**
 * Makes sure $dir exists as a directory, creating it (and any missing
 * parents) when it does not.
 *
 * @param string $dir path of the directory to ensure
 * @return bool true when $dir is a directory on return, false otherwise
 */
function checkdir($dir) {
    // is_dir() rather than file_exists(): a regular file with the same name
    // must not be mistaken for an existing directory.
    if (is_dir($dir)) {
        return true;
    }
    // Re-check after a failed mkdir() in case the directory appeared meanwhile.
    return mkdir($dir, 0777, true) || is_dir($dir);
}
?> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="OldBaileyDocumentsCombine.xsl"?>
<!-- Manifest: each <document> names, via href, one source file to be
     processed by the stylesheet referenced above. -->
<documents>
    <document href="ob/16740429.xml" />
    <document href="ob/16740717.xml" />
    <!-- and a couple of thousand more documents -->
</documents>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?>
<!-- Combines the trialAccount elements of every document listed in the input
     manifest into a single <combined-documents> file. -->
<!-- exclude-result-prefixes keeps the unused xs namespace declaration out of
     the generated output. -->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    exclude-result-prefixes="xs" version="2.0">
    <xsl:output method="xml"/>

    <!-- match the root of the manifest document -->
    <xsl:template match="/">
        <!-- wrap all documents in a new root tag -->
        <combined-documents>
            <!-- load every file named by a /documents/document/@href and
                 process its root in "document" mode -->
            <xsl:apply-templates select="document(/documents/document/@href)" mode="document"/>
        </combined-documents>
    </xsl:template>

    <!-- this will match the root of each loaded document -->
    <xsl:template match="/" mode="document">
        <!-- this is the XPath we're looking for -->
        <xsl:apply-templates select="//*[@type='trialAccount']"/>
    </xsl:template>

    <xsl:template match="*[@type='trialAccount']">
        <!-- wrap each match, carrying over its id, so that we can easily find it afterwards -->
        <document id="{@id}">
            <xsl:copy-of select="."/>
        </document>
    </xsl:template>
</xsl:stylesheet>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment