Last active
December 25, 2017 14:54
-
-
Save Roemerb/d28fcd84988b1e3e9bd80698db9ac318 to your computer and use it in GitHub Desktop.
Convert ACL movie review files to csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"encoding/csv" | |
"errors" | |
"fmt" | |
"io/ioutil" | |
"log" | |
"os" | |
"strconv" | |
"strings" | |
) | |
const ( | |
// ACLTrainDir is the directory of the train data from the ACL set. | |
// GetACLReviews reads its "pos" and "neg" subdirectories. | |
ACLTrainDir = "./aclImdb/train" | |
// KaggleDataDir is the directory of the dataset from Kaggle. | |
// GetKaggleReviews reads the file "data.tsv" inside it. | |
KaggleDataDir = "./kaggle" | |
) | |
// FileHeader holds the header row that main writes as the first line of | |
// the output CSV. The column order (id, text, pos, rating) matches the | |
// rows built by GetKaggleReviews and GetRatingsForDir. | |
var FileHeader = [][]string{{"id", "text", "pos", "rating"}} | |
func main() { | |
// Initiate CSV file | |
csvFile, err := os.Create("data.csv") | |
if err != nil { | |
log.Fatal("Could not create CSV file") | |
} | |
defer csvFile.Close() | |
// Initiate CSV writer | |
writer := csv.NewWriter(csvFile) | |
defer writer.Flush() | |
aclReviews, err := GetACLReviews() | |
if err != nil { | |
log.Fatal(err.Error()) | |
} | |
kaggleReviews, err := GetKaggleReviews() | |
if err != nil { | |
log.Fatal(err.Error()) | |
} | |
allReviews := append(aclReviews, kaggleReviews...) | |
// Combine reviews with full headers | |
fullFile := append(FileHeader, allReviews...) | |
// Write the CSV data | |
for key, value := range fullFile { | |
err := writer.Write(value) | |
if err != nil { | |
log.Println("Could not write line " + strconv.Itoa(key)) | |
} | |
} | |
} | |
// GetKaggleReviews reads the Kaggle data set file and returns it formatted | |
func GetKaggleReviews() ([][]string, error) { | |
var data = [][]string{} | |
f, err := os.Open(KaggleDataDir + "/data.tsv") | |
if err != nil { | |
fmt.Println("Could not open kaggle") | |
return [][]string{}, err | |
} | |
defer f.Close() | |
r := csv.NewReader(bufio.NewReader(f)) | |
// Set the separator to a tab | |
r.Comma = '\t' | |
r.LazyQuotes = true | |
r.FieldsPerRecord = -1 | |
csvData, err := r.ReadAll() | |
if err != nil { | |
return [][]string{}, err | |
} | |
// Extract record from the CSV file. Format is: | |
// | |
// 0: The ID + rating in format "{id}_{rating}" | |
// 1: sentiment (0 or 1) | |
// 2: review text | |
for i, row := range csvData { | |
// Skip the header line | |
if i == 0 { | |
continue | |
} | |
// Trim "" in rating + id | |
row[0] = strings.Replace(row[0], "\"", "", -1) | |
idRating := strings.Split(row[0], "_") | |
if len(idRating) < 2 { | |
return [][]string{}, errors.New("Invalid format: " + row[0]) | |
} | |
// Get ID and rating from splitted string | |
id := idRating[0] | |
rating := idRating[1] | |
// Add data to array | |
data = append(data, []string{id, row[2], row[1], rating}) | |
} | |
return data, nil | |
} | |
// GetACLReviews reads all ACL dataset files and returns a string array | |
// with the ratings | |
func GetACLReviews() ([][]string, error) { | |
// Get the positive reviews | |
posReviews, err := GetRatingsForDir(ACLTrainDir+"/pos", true) | |
if err != nil { | |
log.Fatal("Could not get positive reviews: " + err.Error()) | |
} | |
// Get the negative reviews | |
negReviews, err := GetRatingsForDir(ACLTrainDir+"/neg", false) | |
if err != nil { | |
log.Fatal("Could not get negative reviews: " + err.Error()) | |
} | |
// Combine reviews | |
allReviews := append(posReviews, negReviews...) | |
return allReviews, nil | |
} | |
// GetRatingsForDir reads all the rating files in dir and returns one row
// per file in the order {id, text, isPos, rating}.
//
// File names are expected to look like "{id}_{rating}.txt". The id is the
// part before the first '_', the rating is the part after it with a
// trailing ".txt" removed, and text is the file's full content. isPos is
// "1" when pos is true and "0" otherwise.
//
// Subdirectories are ignored. Entries whose name contains no '_' and files
// that cannot be read are logged and skipped. An error is returned, with a
// nil slice, only when dir itself cannot be listed.
func GetRatingsForDir(dir string, pos bool) ([][]string, error) {
	files, err := ioutil.ReadDir(dir)
	if err != nil {
		return nil, err
	}

	// The positivity string is the same for every file in dir.
	isPos := "0"
	if pos {
		isPos = "1"
	}

	data := make([][]string, 0, len(files))
	for _, file := range files {
		if file.IsDir() {
			continue
		}

		// Split the file name to get the rating and id
		nameParts := strings.Split(file.Name(), "_")
		if len(nameParts) < 2 {
			// Without a '_' there is no rating part to index; skip the
			// entry rather than panic on nameParts[1].
			log.Println("Invalid name: " + file.Name())
			continue
		}

		// The ID is the first part of the file name
		id := nameParts[0]
		// The second part still carries the extension; strip it
		rating := strings.TrimSuffix(nameParts[1], ".txt")

		// Read the file and convert bytes to string
		textBytes, err := ioutil.ReadFile(dir + "/" + file.Name())
		if err != nil {
			// Do not emit a row with empty text for an unreadable file.
			log.Println("Error reading " + file.Name() + ": " + err.Error())
			continue
		}

		// Append id, text, isPos, rating to the data
		data = append(data, []string{id, string(textBytes), isPos, rating})
	}

	return data, nil
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment