Skip to content

Instantly share code, notes, and snippets.

@IanSimpson
Last active April 25, 2019 23:08
Show Gist options
  • Save IanSimpson/bc9505a650d65251f5ec5faf0eed37ce to your computer and use it in GitHub Desktop.
Save IanSimpson/bc9505a650d65251f5ec5faf0eed37ce to your computer and use it in GitHub Desktop.
LightTag to AWS Comprehend annotation conversion tool
<?php
/**
* index.php
*
* LightTag to AWS Comprehend annotation converter
*
* @author Ian Simpson <ian@logicstudio.nz>
* @copyright 2019 The Logic Studio Limited
* @license GPL 3
* @license https://opensource.org/licenses/GPL-3.0 GNU General Public License version 3
*
* To use, drop this file in a folder, along with a folder called "input". Put the JSON files exported from LightTag into
* the "input" folder, then run this script. As output you will get a file "out.csv", and a folder "output" containing all
* source texts. Put this on S3, point AWS Comprehend at it, and it should magically work.
*
*/
/**
 * Split one LightTag annotation into per-line spans for the Comprehend annotations CSV.
 *
 * LightTag addresses an annotation by character offsets into the whole text, whereas each
 * CSV row carries a line number plus offsets relative to the start of that line. An
 * annotation that crosses a newline is therefore returned as one span per line it touches;
 * lines on which it covers no characters produce no span.
 *
 * Offsets are counted in characters (not bytes). The end offset of each span is the begin
 * offset plus the number of annotated characters on that line.
 *
 * @param string $content Full text the annotation refers to (UTF-8, as produced by json_decode)
 * @param int    $start   Character offset at which the annotation begins
 * @param int    $end     Character offset at which the annotation ends (not itself annotated)
 *
 * @return array List of [line, begin offset, end offset] triples, all zero-based
 */
function lightTagAnnotationSpans($content, $start, $end)
{
    $content = (string) $content;
    $start = max(0, (int) $start);
    $end = (int) $end;
    if ($end <= $start) {
        return [];
    }

    // Everything before the annotation determines the line and column it starts at.
    // "\n" is a single byte that never occurs inside a multi-byte UTF-8 sequence, so the
    // byte-wise substr_count/strrpos are safe here.
    $prefix = mb_substr($content, 0, $start, 'UTF-8');
    $line = substr_count($prefix, "\n");
    $lastNewline = strrpos($prefix, "\n");
    $currentLine = $lastNewline === false ? $prefix : (string) substr($prefix, $lastNewline + 1);
    $begin = mb_strlen($currentLine, 'UTF-8');

    // The slice is clamped to the text, so an annotation that overruns the text cannot
    // yield an end offset beyond the real end of its line.
    $annotated = mb_substr($content, $start, $end - $start, 'UTF-8');

    $spans = [];
    foreach (explode("\n", $annotated) as $segment) {
        $length = mb_strlen($segment, 'UTF-8');
        if ($length > 0) {
            $spans[] = [$line, $begin, $begin + $length];
        }
        // Every segment after the first starts at the beginning of the following line
        $line++;
        $begin = 0;
    }

    return $spans;
}

// Open our output file; nothing useful can be produced without it
$fh = fopen('out.csv', 'w');
if ($fh === false) {
    throw new RuntimeException('Unable to open out.csv for writing');
}

// Add a header line
fputcsv($fh, ['File', 'Line', 'Begin Offset', 'End Offset', 'Type']);

// Make sure the input and output folders are ready
foreach (['output', 'input'] as $dir) {
    if (!file_exists($dir)) {
        mkdir($dir);
    }
}

// Hashes of the texts already written, used as a set (hash => true). Keyed lookup avoids
// the loose string comparison of in_array(), under which two different "0e<digits>" hashes
// would be treated as equal.
$done = [];

// Find all the files in the input folder, and loop through them
$files = scandir('input');
foreach ($files ?: [] as $f) {
    $path = 'input/' . $f;

    // Skip dot entries (".", "..", hidden files) and anything that is not a regular file
    if (substr($f, 0, 1) === '.' || !is_file($path)) {
        continue;
    }

    // Get and parse the input file
    $in = json_decode((string) file_get_contents($path));
    if (!is_object($in) || !isset($in->annotations_and_examples)) {
        trigger_error('Skipping ' . $path . ': no annotations_and_examples found', E_USER_WARNING);
        continue;
    }

    // Loop through each text
    foreach ($in->annotations_and_examples as $a) {
        // The hash of the text serves as its unique ID and as its output file name
        $hash = md5($a->content);

        // Make sure we only process each text once
        if (isset($done[$hash])) {
            continue;
        }
        $done[$hash] = true;

        // Output the text to a file ready for upload to S3
        file_put_contents('output/' . $hash, $a->content);

        // Loop through each annotation
        foreach ($a->annotations as $an) {
            $type = str_replace(' ', '_', strtoupper($an->tag));

            // Multi-line annotations are logged as one row per line
            foreach (lightTagAnnotationSpans($a->content, $an->start, $an->end) as $span) {
                fputcsv($fh, [
                    $hash,    // File
                    $span[0], // Line
                    $span[1], // Begin Offset
                    $span[2], // End Offset
                    $type,    // Type
                ]);
            }
        }
    }
}

// Clean up behind ourselves
fclose($fh);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment