Created
April 22, 2018 16:46
-
-
Save awesomephant/e709a9c9ffeea5901c8589888c785661 to your computer and use it in GitHub Desktop.
Switchboard Parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require('fs');
const path = require('path');
const parse = require('csv-parse/lib/sync');
const mkdirp = require('mkdirp');

// Input root: one sub-directory per topic, each holding CSV dialogue files.
const dataDir = './data/switchboard_conversations/';
// Output root: mirrors the topic sub-directories, one .txt file per CSV.
const outputDir = './data/clean/';

/**
 * Render the parsed rows of one dialogue CSV as plain text.
 *
 * The first row supplies the header (prompt, swda_filename, conversation_no);
 * every row then contributes one "caller.utterance_index: clean_text" line.
 *
 * @param {Object[]} records - Non-empty array of rows from csv-parse with `columns: true`.
 * @returns {string} The header followed by one line per row.
 */
function formatDialogue(records) {
  const first = records[0];
  // NOTE(review): the labels below have no separator before their values
  // ("SWDA_Filename<name>"); kept as-is so the output format is unchanged.
  const header = first.prompt + '\nSWDA_Filename' + first.swda_filename +
    '\nConversation No' + first.conversation_no + '\n\n';
  const lines = records.map(
    (row) => row.caller + '.' + row.utterance_index + ': ' + row.clean_text + '\n'
  );
  return header + lines.join('');
}

let topics = fs.readdirSync(dataDir);
let data = []; // not used by this script

for (const topic of topics) {
  const topicDir = path.join(dataDir, topic);

  // Stray files in the data directory (e.g. .DS_Store) are not topics;
  // calling readdirSync on them would throw ENOTDIR.
  if (!fs.statSync(topicDir).isDirectory()) {
    continue;
  }

  const files = fs.readdirSync(topicDir);
  console.log('Topic: ' + topic);
  console.log(files.length + ' dialogues found.\n');

  for (const filename of files) {
    const file = fs.readFileSync(path.join(topicDir, filename), 'utf-8');
    const records = parse(file, { columns: true });

    // A CSV with a header but no rows has nothing to write, and reading
    // records[0] would throw.
    if (records.length === 0) {
      console.warn('Skipping empty dialogue file: ' + path.join(topicDir, filename));
      continue;
    }

    // Let's make a clean text file
    const dialogue = formatDialogue(records);

    // Create the directory synchronously so failures throw here instead of
    // being dropped in an ignored callback argument; the callback form is
    // also not supported by promise-based mkdirp releases.
    const outputPath = path.join(outputDir, topic);
    mkdirp.sync(outputPath);
    fs.writeFileSync(path.join(outputPath, filename + '.txt'), dialogue, 'utf-8');
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment