Last active
June 25, 2022 21:31
-
-
Save AlexPernot/245b20ec36bef895d9fac48ad36767f3 to your computer and use it in GitHub Desktop.
A friend of mine wanted to extract all the dialogue from the original Monkey Island into CSVs by character and language. Here's a JS script to do it; it was a fun experiment in reverse-engineering a 90s game localization file and working with Node.js Buffers. Usage: put the "speech.info" file next to this script and simply run it with `node index.js`…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Reads the Monkey Island speech.info file and extracts the data into several CSVs.
 */
const fs = require("fs"); | |
// Sound file IDs, in file order (e.g. "GUY_32_alley_1_1"). The leading three letters are the actor ID.
const fileIds = [];

// Dialogue lines, in file order, one array per locale.
const lines = {
    en: [],
    fr: [],
    it: [],
    de: [],
    es: [],
};
const locales = Object.keys(lines);

// Switch to ";" for Excel on a French Windows install. Thanks Excel.
const csvSeparator = ",";
const csvData = {};

// Buffer scanning constants; you probably don't want to change these.
// The first bytes of each record are non-printable (maybe a header of some sort) and are skipped.
const lineStartOffset = 16;
const lineLength = 256;
const speakerLength = 32;
const encoding = "latin1";
/**
 * Strips NUL bytes, turns every \x01 byte into a space, then trims trailing whitespace.
 * @param {String} str The raw decoded string
 * @return {String} The cleaned string
 */
const cleanString = str => {
    const withoutNulls = str.split("\x00").join("");
    const withSpaces = withoutNulls.split("\x01").join(" ");
    return withSpaces.trimEnd();
};
/**
 * Escapes a value so it can be written as a single CSV field.
 * Embedded double quotes are doubled, and the field is wrapped in double quotes
 * whenever it contains the separator, a double quote, or a line break. An unquoted
 * field containing doubled quotes would be misread by CSV parsers.
 * @param {String} str The raw field value
 * @return {String} The CSV-safe field
 */
const escapeCsvString = str => {
    const needsQuoting =
        str.includes(csvSeparator) ||
        str.includes('"') ||
        str.includes("\n") ||
        str.includes("\r");
    if (!needsQuoting) {
        return str;
    }
    return `"${str.replace(/"/g, '""')}"`;
};
/**
 * Decodes one fixed-width text slot per locale, in locale order, and appends each
 * cleaned string to the matching array of the "lines" object.
 * @param {Buffer} buf The input buffer
 * @param {Number} startOffset The position in the buffer of the first locale's slot
 * @return {Number} The position in the buffer right after the last locale's slot
 */
const readNextLines = (buf, startOffset) => {
    locales.forEach((locale, index) => {
        const slotStart = startOffset + index * lineLength;
        const rawText = buf.toString(encoding, slotStart, slotStart + lineLength);
        lines[locale].push(cleanString(rawText));
    });
    return startOffset + locales.length * lineLength;
};
/**
 * Decodes the fixed-width sound file ID slot and appends the cleaned value to "fileIds".
 * @param {Buffer} buf The input buffer
 * @param {Number} startOffset The position in the buffer where the slot starts
 * @return {Number} The position in the buffer right after the slot
 */
const readNextSpeaker = (buf, startOffset) => {
    const endOffset = startOffset + speakerLength;
    const rawId = buf.toString(encoding, startOffset, endOffset);
    fileIds.push(cleanString(rawId));
    return endOffset;
};
/**
 * Writes one CSV file per actor found in "csvData" into the "csv" directory,
 * creating the directory if needed. Each file starts with a header row, followed
 * by one row per line: the sound file ID, then the text in each language.
 */
const writeCsv = () => {
    fs.mkdirSync("csv", {recursive: true});

    // The header must use the configured separator too, otherwise switching to ";"
    // produces a header row that doesn't line up with the data rows.
    const headerCells = ['"Sound file ID"', "English", "Français", "Italiano", "Deutsche", "Español"];
    const headerRow = `${headerCells.join(csvSeparator)}\n`;
    // Data columns, in the same order as the header cells.
    const columns = ["fileId", "en", "fr", "it", "de", "es"];

    // For each actor, we write a csv
    for (const actorId of Object.keys(csvData)) {
        const actorData = csvData[actorId];
        const writeStream = fs.createWriteStream(`./csv/${actorId}.csv`, {encoding});
        writeStream.write(headerRow);
        for (let i = 0; i < actorData.fileId.length; i++) {
            // Every cell is escaped, the file ID included.
            const cells = columns.map(column => escapeCsvString(actorData[column][i]));
            writeStream.write(`${cells.join(csvSeparator)}\n`);
        }
        // end() flushes the pending writes before the file is closed.
        writeStream.end();
    }
};
// Entry point: reads "speech.info", decodes every record, groups the lines by actor
// and writes the CSVs.
fs.readFile("speech.info", (err, input) => {
    // Without this guard a missing or unreadable file fails later with an opaque
    // TypeError on "input.length".
    if (err) {
        console.error(`Unable to read "speech.info": ${err.message}`);
        process.exitCode = 1;
        return;
    }

    // We read the file and build the "lines" and "fileIds" collections.
    // Each record is: skipped leading bytes, one text slot per locale, then the sound file ID.
    let position = 0;
    while (position < input.length) {
        position += lineStartOffset;
        position = readNextLines(input, position);
        position = readNextSpeaker(input, position);
    }

    // We make a new data structure to sort the lines by actor
    for (const [i, fileId] of fileIds.entries()) {
        // Some lines don't have an actor. /shrug
        const actorId = fileId.slice(0, 3) || "___";

        // If we have a new actor ID, we build a new sub-object
        if (csvData[actorId] === undefined) {
            const actorData = {fileId: []};
            for (const locale of locales) {
                actorData[locale] = [];
            }
            csvData[actorId] = actorData;
        }

        csvData[actorId].fileId.push(fileId);
        for (const locale of locales) {
            csvData[actorId][locale].push(lines[locale][i]);
        }
    }

    // We build the CSVs
    writeCsv();
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment