AlexPernot/MonkeyIslandSpeechInfo2Csv.js

## MonkeyIslandSpeechInfo2Csv.js
/**
 * Reads the Monkey Island speech.info file and extracts the data into several CSVs.
 */
const fs = require("fs");

// Sound file IDs, in the order they appear in the file (e.g. "GUY_32_alley_1_1").
// The three leading characters of an ID are the actor ID.
const fileIds = [];

// Dialogue lines in file order, one array per locale.
// Key order matters: it is the order in which the locales are read from each record.
const lines = {
  en: [],
  fr: [],
  it: [],
  de: [],
  es: []
};
const locales = Object.keys(lines);

// Per-actor data, filled in once the whole file has been read.
const csvData = {};

// Field separator of the generated CSVs.
// Switch to ";" if you have a French Windows install and want Excel support. Thanks Excel.
const csvSeparator = ",";

// Buffer scanning constants, you probably don't want to change these.
const encoding = "latin1";
const lineStartOffset = 16; // Non-printable bytes, maybe a header of some sort; skipped, never read.
const lineLength = 256;
const speakerLength = 32;

// Drops NUL bytes, turns \x01 bytes into spaces, then trims trailing whitespace.
const cleanString = str => {
  const withoutNulls = str.replace(/\x00/g, "");
  const withSpaces = withoutNulls.replace(/\x01/g, " ");

  return withSpaces.trimEnd();
};

/**
 * Escapes a value so it can be written as a single CSV field.
 * Double quotes are doubled, and the field is wrapped in double quotes whenever it contains
 * the separator, a double quote or a line break. A field holding a doubled quote without the
 * surrounding quotes is not valid CSV, so quotes alone are enough to trigger the wrapping.
 * @param {String} str The raw field value
 * @return {String} The field, ready to be joined with "csvSeparator"
 */
const escapeCsvString = str => {
  const needsQuoting = str.includes(csvSeparator) || /["\r\n]/.test(str);

  return needsQuoting ? `"${str.replace(/"/g, '""')}"` : str;
};

/**
 * Reads one fixed-width line per locale from the buffer and appends each of them,
 * cleaned, to the matching array of the "lines" object.
 * @param {Buffer} buf The input buffer
 * @param {Number} startOffset The position in the buffer of the first locale's line
 * @return {Number} The position in the buffer right after the last locale's line
 */
const readNextLines = (buf, startOffset) => {
  locales.forEach((locale, index) => {
    const from = startOffset + index * lineLength;
    const rawLine = buf.toString(encoding, from, from + lineLength);

    lines[locale].push(cleanString(rawLine));
  });

  return startOffset + locales.length * lineLength;
};

/**
 * Reads one fixed-width sound file ID from the buffer and appends it, cleaned,
 * to the "fileIds" array.
 * @param {Buffer} buf The input buffer
 * @param {Number} startOffset The position in the buffer where the ID starts
 * @return {Number} The position in the buffer right after the ID
 */
const readNextSpeaker = (buf, startOffset) => {
  const endOffset = startOffset + speakerLength;
  const rawId = buf.toString(encoding, startOffset, endOffset);

  fileIds.push(cleanString(rawId));

  return endOffset;
};

/**
 * Writes one CSV file per actor into the "csv" directory (created if missing).
 * Each file starts with a header row, followed by one row per sound file:
 * the sound file ID, then the line in each locale.
 * Reads the "csvData", "csvSeparator" and "encoding" module-level values.
 */
const writeCsv = () => {
  // Column labels keyed by locale; the key order is the column order.
  const localeLabels = {
    en: "English",
    fr: "Français",
    it: "Italiano",
    de: "Deutsche",
    es: "Español"
  };
  const columnLocales = Object.keys(localeLabels);
  // Built with csvSeparator so the header stays aligned with the rows when the separator is changed.
  const headerRow = ['"Sound file ID"', ...Object.values(localeLabels)].join(csvSeparator);

  fs.mkdirSync("csv", {recursive: true});

  // For each actor, we write a csv
  for (const [actorId, actorData] of Object.entries(csvData)) {
    const writeStream = fs.createWriteStream(`./csv/${actorId}.csv`, {encoding});

    writeStream.write(`${headerRow}\n`);

    actorData.fileId.forEach((fileId, i) => {
      const cells = [fileId, ...columnLocales.map(locale => actorData[locale][i])];

      // The file ID is escaped too, so an ID containing the separator can't shift the columns.
      writeStream.write(`${cells.map(cell => escapeCsvString(cell)).join(csvSeparator)}\n`);
    });

    // end() signals that no more data will be written, letting pending writes flush before the file closes.
    writeStream.end();
  }
};

// Entry point: reads speech.info, fills the "lines" and "fileIds" collections,
// regroups everything by actor into "csvData", then writes the CSVs.
fs.readFile("speech.info", (err, input) => {
  // Without this guard, a missing or unreadable file surfaces as a confusing
  // TypeError on "input.length" instead of the real I/O error.
  if (err) {
    throw err;
  }

  // Each record is: lineStartOffset bytes we skip, one line per locale, then the sound file ID.
  let position = 0;

  while (position < input.length) {
    position += lineStartOffset;
    position = readNextLines(input, position);
    position = readNextSpeaker(input, position);
  }

  // We make a new data structure to sort the lines by actor.
  // A Set keeps the "have we seen this actor" check constant-time.
  const knownActors = new Set();

  for (const [i, fileId] of fileIds.entries()) {
    // Some lines don't have an actor: an empty ID falls back to the "___" placeholder.
    const actorId = fileId.slice(0, 3) || "___";

    // If we have a new actor ID, we build a new sub-object
    if (!knownActors.has(actorId)) {
      knownActors.add(actorId);

      csvData[actorId] = {
        "fileId": [],
        "en": [],
        "fr": [],
        "it": [],
        "de": [],
        "es": []
      };
    }

    csvData[actorId].fileId.push(fileId);

    for (const locale of locales) {
      csvData[actorId][locale].push(lines[locale][i]);
    }
  }

  // We build the CSVs
  writeCsv();
});
	/**
	* Reads the Monkey Island speech.info file and extracts the data into several CSVs.
	*/
	const fs = require("fs");

	// Sound file IDs, in the order they appear in the file (e.g. "GUY_32_alley_1_1").
	// The three leading characters of an ID are the actor ID.
	const fileIds = [];

	// Dialogue lines in file order, one array per locale.
	// Key order matters: it is the order in which the locales are read from each record.
	const lines = {
		en: [],
		fr: [],
		it: [],
		de: [],
		es: []
	};
	const locales = Object.keys(lines);

	// Per-actor data, filled in once the whole file has been read.
	const csvData = {};

	// Field separator of the generated CSVs.
	// Switch to ";" if you have a French Windows install and want Excel support. Thanks Excel.
	const csvSeparator = ",";

	// Buffer scanning constants, you probably don't want to change these.
	const encoding = "latin1";
	const lineStartOffset = 16; // Non-printable bytes, maybe a header of some sort; skipped, never read.
	const lineLength = 256;
	const speakerLength = 32;

	// Drops NUL bytes, turns \x01 bytes into spaces, then trims trailing whitespace.
	const cleanString = str => {
		const withoutNulls = str.replace(/\x00/g, "");
		const withSpaces = withoutNulls.replace(/\x01/g, " ");

		return withSpaces.trimEnd();
	};

	/**
	 * Escapes a value so it can be written as a single CSV field.
	 * Double quotes are doubled, and the field is wrapped in double quotes whenever it contains
	 * the separator, a double quote or a line break. A field holding a doubled quote without the
	 * surrounding quotes is not valid CSV, so quotes alone are enough to trigger the wrapping.
	 * @param {String} str The raw field value
	 * @return {String} The field, ready to be joined with "csvSeparator"
	 */
	const escapeCsvString = str => {
		const needsQuoting = str.includes(csvSeparator) || /["\r\n]/.test(str);

		return needsQuoting ? `"${str.replace(/"/g, '""')}"` : str;
	};

	/**
	 * Reads one fixed-width line per locale from the buffer and appends each of them,
	 * cleaned, to the matching array of the "lines" object.
	 * @param {Buffer} buf The input buffer
	 * @param {Number} startOffset The position in the buffer of the first locale's line
	 * @return {Number} The position in the buffer right after the last locale's line
	 */
	const readNextLines = (buf, startOffset) => {
		locales.forEach((locale, index) => {
			const from = startOffset + index * lineLength;
			const rawLine = buf.toString(encoding, from, from + lineLength);

			lines[locale].push(cleanString(rawLine));
		});

		return startOffset + locales.length * lineLength;
	};

	/**
	 * Reads one fixed-width sound file ID from the buffer and appends it, cleaned,
	 * to the "fileIds" array.
	 * @param {Buffer} buf The input buffer
	 * @param {Number} startOffset The position in the buffer where the ID starts
	 * @return {Number} The position in the buffer right after the ID
	 */
	const readNextSpeaker = (buf, startOffset) => {
		const endOffset = startOffset + speakerLength;
		const rawId = buf.toString(encoding, startOffset, endOffset);

		fileIds.push(cleanString(rawId));

		return endOffset;
	};

	/**
	 * Writes one CSV file per actor into the "csv" directory (created if missing).
	 * Each file starts with a header row, followed by one row per sound file:
	 * the sound file ID, then the line in each locale.
	 * Reads the "csvData", "csvSeparator" and "encoding" module-level values.
	 */
	const writeCsv = () => {
		// Column labels keyed by locale; the key order is the column order.
		const localeLabels = {
			en: "English",
			fr: "Français",
			it: "Italiano",
			de: "Deutsche",
			es: "Español"
		};
		const columnLocales = Object.keys(localeLabels);
		// Built with csvSeparator so the header stays aligned with the rows when the separator is changed.
		const headerRow = ['"Sound file ID"', ...Object.values(localeLabels)].join(csvSeparator);

		fs.mkdirSync("csv", {recursive: true});

		// For each actor, we write a csv
		for (const [actorId, actorData] of Object.entries(csvData)) {
			const writeStream = fs.createWriteStream(`./csv/${actorId}.csv`, {encoding});

			writeStream.write(`${headerRow}\n`);

			actorData.fileId.forEach((fileId, i) => {
				const cells = [fileId, ...columnLocales.map(locale => actorData[locale][i])];

				// The file ID is escaped too, so an ID containing the separator can't shift the columns.
				writeStream.write(`${cells.map(cell => escapeCsvString(cell)).join(csvSeparator)}\n`);
			});

			// end() signals that no more data will be written, letting pending writes flush before the file closes.
			writeStream.end();
		}
	};

	// Entry point: reads speech.info, fills the "lines" and "fileIds" collections,
	// regroups everything by actor into "csvData", then writes the CSVs.
	fs.readFile("speech.info", (err, input) => {
		// Without this guard, a missing or unreadable file surfaces as a confusing
		// TypeError on "input.length" instead of the real I/O error.
		if (err) {
			throw err;
		}

		// Each record is: lineStartOffset bytes we skip, one line per locale, then the sound file ID.
		let position = 0;

		while (position < input.length) {
			position += lineStartOffset;
			position = readNextLines(input, position);
			position = readNextSpeaker(input, position);
		}

		// We make a new data structure to sort the lines by actor.
		// A Set keeps the "have we seen this actor" check constant-time.
		const knownActors = new Set();

		for (const [i, fileId] of fileIds.entries()) {
			// Some lines don't have an actor: an empty ID falls back to the "___" placeholder.
			const actorId = fileId.slice(0, 3) || "___";

			// If we have a new actor ID, we build a new sub-object
			if (!knownActors.has(actorId)) {
				knownActors.add(actorId);

				csvData[actorId] = {
					"fileId": [],
					"en": [],
					"fr": [],
					"it": [],
					"de": [],
					"es": []
				};
			}

			csvData[actorId].fileId.push(fileId);

			for (const locale of locales) {
				csvData[actorId][locale].push(lines[locale][i]);
			}
		}

		// We build the CSVs
		writeCsv();
	});