Skip to content

Instantly share code, notes, and snippets.

@Roemerb
Last active December 25, 2017 14:54
Show Gist options
  • Save Roemerb/d28fcd84988b1e3e9bd80698db9ac318 to your computer and use it in GitHub Desktop.
Save Roemerb/d28fcd84988b1e3e9bd80698db9ac318 to your computer and use it in GitHub Desktop.
Convert ACL movie review files to csv
package main
import (
"bufio"
"encoding/csv"
"errors"
"fmt"
"io/ioutil"
"log"
"os"
"strconv"
"strings"
)
// Locations of the two input data sets, given relative to the directory the
// program is started from.
const (
	// ACLTrainDir is the root of the ACL training data; the review files are
	// read from its "pos" and "neg" subdirectories.
	ACLTrainDir = "./aclImdb/train"

	// KaggleDataDir is the directory that contains the Kaggle data.tsv file.
	KaggleDataDir = "./kaggle"
)
// FileHeader is the header section of the output CSV: a single row naming the
// four columns (id, text, pos, rating) in the order every review row uses.
var FileHeader = [][]string{
	{"id", "text", "pos", "rating"},
}
// main merges the ACL and Kaggle review sets into data.csv in the current
// working directory. The file starts with FileHeader, followed by the ACL
// rows and then the Kaggle rows.
//
// The process exits with a non-zero status if either data set cannot be
// loaded or if the output file cannot be created, flushed or closed. A
// failure to write an individual row is logged and the remaining rows are
// still attempted.
func main() {
	// Load both inputs before touching the output file, so that a failed load
	// does not truncate a data.csv left over from an earlier successful run.
	aclReviews, err := GetACLReviews()
	if err != nil {
		log.Fatal(err.Error())
	}
	kaggleReviews, err := GetKaggleReviews()
	if err != nil {
		log.Fatal(err.Error())
	}

	csvFile, err := os.Create("data.csv")
	if err != nil {
		log.Fatal("Could not create CSV file: " + err.Error())
	}
	writer := csv.NewWriter(csvFile)

	// Assemble the rows in a fresh slice instead of appending onto the
	// package-level FileHeader.
	fullFile := make([][]string, 0, len(FileHeader)+len(aclReviews)+len(kaggleReviews))
	fullFile = append(fullFile, FileHeader...)
	fullFile = append(fullFile, aclReviews...)
	fullFile = append(fullFile, kaggleReviews...)

	for key, value := range fullFile {
		if err := writer.Write(value); err != nil {
			log.Println("Could not write line " + strconv.Itoa(key) + ": " + err.Error())
		}
	}

	// Flush and close explicitly rather than with defer: log.Fatal exits the
	// process without running deferred calls, and both steps can fail in ways
	// that would otherwise leave a silently truncated file.
	writer.Flush()
	flushErr := writer.Error()
	closeErr := csvFile.Close()
	if flushErr != nil {
		log.Fatal("Could not flush CSV file: " + flushErr.Error())
	}
	if closeErr != nil {
		log.Fatal("Could not close CSV file: " + closeErr.Error())
	}
}
// GetKaggleReviews reads data.tsv from KaggleDataDir and returns one row per
// review in the column order id, text, pos, rating.
//
// The first record of the file is treated as a header and skipped. Every
// other record must have at least three tab-separated fields:
//
//	0: the ID and rating in the format "{id}_{rating}"
//	1: sentiment (0 or 1)
//	2: review text
//
// An error is returned if the file cannot be opened or parsed, if a record
// has fewer than three fields, or if the first field of a record does not
// contain an underscore. On error the returned slice is nil.
func GetKaggleReviews() ([][]string, error) {
	f, err := os.Open(KaggleDataDir + "/data.tsv")
	if err != nil {
		// The error from os.Open already names the path; the caller decides
		// how to report it.
		return nil, err
	}
	defer f.Close()

	r := csv.NewReader(bufio.NewReader(f))
	// Set the separator to a tab
	r.Comma = '\t'
	// Tolerate quote characters that are not valid CSV quoting.
	r.LazyQuotes = true
	// A negative value disables the reader's own field-count check, so the
	// number of fields has to be validated per record below.
	r.FieldsPerRecord = -1
	csvData, err := r.ReadAll()
	if err != nil {
		return nil, err
	}

	data := make([][]string, 0, len(csvData))
	for i, row := range csvData {
		// Skip the header line
		if i == 0 {
			continue
		}
		// Without this check a short record would panic on row[1] / row[2].
		if len(row) < 3 {
			return nil, fmt.Errorf("Invalid format: record %d has %d fields, want at least 3", i+1, len(row))
		}

		// Strip any quote characters left in the "{id}_{rating}" field.
		idAndRating := strings.Replace(row[0], "\"", "", -1)
		idRating := strings.Split(idAndRating, "_")
		if len(idRating) < 2 {
			return nil, errors.New("Invalid format: " + idAndRating)
		}
		id := idRating[0]
		rating := idRating[1]

		// Reorder the columns to id, text, pos, rating.
		data = append(data, []string{id, row[2], row[1], rating})
	}
	return data, nil
}
// GetACLReviews reads the ACL training reviews from the "pos" and "neg"
// subdirectories of ACLTrainDir and returns them as rows in the column order
// id, text, pos, rating, with all positive reviews first and the negative
// reviews after them.
//
// If either directory cannot be read, the error is returned to the caller
// (with a prefix saying which set failed) instead of terminating the process
// here, so the declared error result is meaningful. On error the returned
// slice is nil.
func GetACLReviews() ([][]string, error) {
	// Get the positive reviews
	posReviews, err := GetRatingsForDir(ACLTrainDir+"/pos", true)
	if err != nil {
		return nil, errors.New("Could not get positive reviews: " + err.Error())
	}

	// Get the negative reviews
	negReviews, err := GetRatingsForDir(ACLTrainDir+"/neg", false)
	if err != nil {
		return nil, errors.New("Could not get negative reviews: " + err.Error())
	}

	// Combine reviews
	return append(posReviews, negReviews...), nil
}
// GetRatingsForDir reads every review file in dir and returns one row per
// file in the column order id, text, pos, rating.
//
// File names are expected to look like "{id}_{rating}.txt". The id and the
// rating are taken from the file name, the text is the content of the file,
// and the pos column is "1" when pos is true and "0" otherwise.
//
// Entries whose name contains no underscore and entries that cannot be read
// are logged and skipped, so a stray file does not abort the run or produce a
// row with empty text. An error is returned only when dir itself cannot be
// listed; in that case the returned slice is nil.
func GetRatingsForDir(dir string, pos bool) ([][]string, error) {
	files, err := ioutil.ReadDir(dir)
	if err != nil {
		return nil, err
	}

	// The positivity flag is the same for every file in dir.
	isPos := "0"
	if pos {
		isPos = "1"
	}

	data := make([][]string, 0, len(files))
	for _, file := range files {
		name := file.Name()

		// Split "{id}_{rating}.txt" at '_'. Without the skip below, a name
		// with no underscore would panic on nameParts[1].
		nameParts := strings.Split(name, "_")
		if len(nameParts) < 2 {
			log.Println("Invalid name: " + name)
			continue
		}
		id := nameParts[0]
		// The second part still carries the extension; drop it.
		rating := strings.TrimSuffix(nameParts[1], ".txt")

		textBytes, err := ioutil.ReadFile(dir + "/" + name)
		if err != nil {
			log.Println("Error reading " + name + ": " + err.Error())
			continue
		}

		data = append(data, []string{id, string(textBytes), isPos, rating})
	}
	return data, nil
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment