Skip to content

Instantly share code, notes, and snippets.

@akesling
Forked from cdfox/preprocess.go
Last active December 15, 2015 21:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save akesling/5328059 to your computer and use it in GitHub Desktop.
Save akesling/5328059 to your computer and use it in GitHub Desktop.
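// For each line of the input file, replace every non-word character (anything
// matched by the regexp `\W`, i.e. outside [0-9A-Za-z_]) with a space,
// lowercase all letters, drop stopwords, and write the remaining tokens to the
// output file as one comma-separated line. Lines that end up with no tokens
// are skipped. I used the answer here as a template for reading/writing files:
// http://stackoverflow.com/questions/1821811/how-to-read-write-from-to-file/9739903#9739903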
package main
import (
"bufio"
"fmt"
"io"
"os"
"regexp"
"strings"
)
// nonWord matches every character that is not an ASCII letter, digit, or
// underscore. It is compiled once here rather than once per input line.
var nonWord = regexp.MustCompile(`\W`)

// main expects three command-line arguments: the input file, the stopword
// file (one word per line), and the output file. It prints a usage message and
// exits with status 1 when too few arguments are given, and reports any I/O
// failure on stderr before exiting with status 1.
func main() {
	if len(os.Args) < 4 {
		fmt.Printf("Too few arguments. Usage: %s inputFile stopwordFile outputFile \n", os.Args[0])
		os.Exit(1)
	}
	if err := run(os.Args[1], os.Args[2], os.Args[3]); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}

// run opens the input and stopword files, creates (truncating) the output
// file, and writes the filtered form of every input line to it. The output
// file is only created after the stopword list has been read successfully.
func run(inPath, stopPath, outPath string) (err error) {
	inFile, err := os.Open(inPath)
	if err != nil {
		return err
	}
	defer inFile.Close()

	stopwordFile, err := os.Open(stopPath)
	if err != nil {
		return err
	}
	defer stopwordFile.Close()

	stopwords, err := readStopwords(stopwordFile)
	if err != nil {
		return fmt.Errorf("reading stopwords %s: %w", stopPath, err)
	}

	outFile, err := os.Create(outPath)
	if err != nil {
		return err
	}
	// A failed Close on a file we wrote to can mean lost data, so surface it
	// unless an earlier error is already being returned.
	defer func() {
		if cerr := outFile.Close(); cerr != nil && err == nil {
			err = cerr
		}
	}()

	writer := bufio.NewWriter(outFile)
	if err = filterLines(inFile, writer, stopwords); err != nil {
		return fmt.Errorf("processing %s: %w", inPath, err)
	}
	return writer.Flush()
}

// readStopwords reads one stopword per line from r, trimming surrounding
// whitespace and ignoring blank lines. Words are stored as written; because
// input tokens are lowercased before lookup, only lowercase entries can match.
func readStopwords(r io.Reader) (map[string]bool, error) {
	stopwords := make(map[string]bool)
	reader := bufio.NewReader(r)
	for {
		line, err := reader.ReadString('\n')
		if err != nil && err != io.EOF {
			return nil, err
		}
		// On io.EOF, line still holds any final text that lacked a newline.
		if word := strings.TrimSpace(line); word != "" {
			stopwords[word] = true
		}
		if err == io.EOF {
			return stopwords, nil
		}
	}
}

// filterLines reads r line by line and, for every line that still has tokens
// after filtering, writes those tokens to w joined by commas and terminated
// by a newline. Lines with no remaining tokens produce no output.
func filterLines(r io.Reader, w io.Writer, stopwords map[string]bool) error {
	reader := bufio.NewReader(r)
	for {
		line, readErr := reader.ReadString('\n')
		if readErr != nil && readErr != io.EOF {
			return readErr
		}
		// Process the line before checking for EOF so that a final line
		// without a trailing newline is not dropped.
		if kept := filterTokens(line, stopwords); len(kept) > 0 {
			if _, err := io.WriteString(w, strings.Join(kept, ",")+"\n"); err != nil {
				return err
			}
		}
		if readErr == io.EOF {
			return nil
		}
	}
}

// filterTokens replaces every non-word character in line with a space,
// lowercases the result, splits it on whitespace, and returns the tokens that
// are not in stopwords.
func filterTokens(line string, stopwords map[string]bool) []string {
	cleaned := nonWord.ReplaceAllString(line, " ")
	tokens := strings.Fields(strings.ToLower(cleaned))
	// Filter in place: kept never gets ahead of the read position in tokens.
	kept := tokens[:0]
	for _, word := range tokens {
		if !stopwords[word] {
			kept = append(kept, word)
		}
	}
	return kept
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment