Created
April 27, 2018 12:49
-
-
Save toVersus/0b999dbf31a9d64226db9d9390ff7d86 to your computer and use it in GitHub Desktop.
[Language Processing 100 Essentials] #72: Extract origins by removing the element matched with stop words
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"flag" | |
"fmt" | |
"os" | |
"strings" | |
porterstemmer "github.com/reiver/go-porterstemmer" | |
) | |
// StopWords lists the lowercase English words that isStopWord matches
// against: pronouns, interrogatives, auxiliary verbs, articles and a few
// conjunctions.
var StopWords = []string{
	// Personal, possessive and reflexive pronouns.
	"i", "me", "my", "myself",
	"we", "our", "ours", "ourselves",
	"you", "your", "yours", "yourself", "yourselves",
	"he", "him", "his", "himself",
	"she", "her", "hers", "herself",
	"it", "its", "itself",
	"they", "them", "their", "theirs", "themselves",

	// Interrogatives and demonstratives.
	"what", "which", "who", "whom",
	"this", "that", "these", "those",

	// Forms of "be", "have" and "do".
	"am", "is", "are", "was", "were",
	"be", "been", "being",
	"have", "has", "had", "having",
	"do", "does", "did", "doing",

	// Articles and conjunctions.
	"a", "an", "the",
	"and", "but", "if", "or", "because", "as", "until", "while",
}
// review is one parsed input line: a sentiment label plus the words that
// were kept from the rest of the line.
type review struct {
	// origins holds the words of the line that were kept after filtering.
	origins []string
	// sentiment is the first whitespace-separated field of the line.
	sentiment string
}
func newReviews(path string) reviews { | |
f, _ := os.Open(path) | |
defer f.Close() | |
sc := bufio.NewScanner(f) | |
reviews := reviews{} | |
for sc.Scan() { | |
words := strings.Fields(sc.Text()) | |
review := review{ | |
sentiment: words[0], | |
} | |
for _, word := range words[1:] { | |
if isStopWord(word) || isMetaCharacter(word) { | |
continue | |
} | |
review.origins = append(review.origins, word) | |
} | |
reviews = append(reviews, review) | |
} | |
return reviews | |
} | |
type reviews []review | |
func (reviews *reviews) getStem() reviews { | |
for _, review := range *reviews { | |
for i, origin := range review.origins { | |
review.origins[i] = porterstemmer.StemString(origin) | |
} | |
} | |
return *reviews | |
} | |
func main() { | |
var filePath string | |
flag.StringVar(&filePath, "file", "", "specify a file path") | |
flag.StringVar(&filePath, "f", "", "specify a file path") | |
flag.Parse() | |
if _, err := os.Stat(filePath); err != nil { | |
fmt.Fprintf(os.Stderr, "could not find a file: %s\n %#v", filePath, err) | |
os.Exit(1) | |
} | |
reviews := newReviews(filePath) | |
fmt.Printf("%#v\n", reviews.getStem()) | |
} | |
func isStopWord(word string) bool { | |
dict := make(map[string]struct{}, len(StopWords)) | |
for _, w := range StopWords { | |
dict[w] = struct{}{} | |
} | |
if _, ok := dict[word]; ok { | |
return ok | |
} | |
return false | |
} | |
// MetaCharacters lists the punctuation tokens that isMetaCharacter matches
// against.
var MetaCharacters = []string{
	".", ",", ":", ";", "(", ")", "!", "?", "/", "\\",
	"{", "}", "-", "_", "[", "]", "~", "'", "--", "\"",
}

// isMetaCharacter reports whether word is exactly equal to one of the
// entries of MetaCharacters. A word that merely contains punctuation
// (for example "it's") does not match.
//
// MetaCharacters is scanned directly: building a fresh lookup map on every
// call costs a full pass over the list plus an allocation just to answer a
// single membership question.
func isMetaCharacter(word string) bool {
	for _, meta := range MetaCharacters {
		if meta == word {
			return true
		}
	}
	return false
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"os" | |
"reflect" | |
"strings" | |
"testing" | |
) | |
var extractOriginTests = []struct { | |
name string | |
file string | |
text string | |
want reviews | |
}{ | |
{ | |
name: "should extract formatted words", | |
file: "full-test.txt", | |
text: `+1 the bard as black comedy -- willie would have loved it . | |
-1 it's a frankenstein-monster of a film that doesn't know what it wants to be . | |
`, | |
want: reviews{ | |
review{origins: []string{"bard", "black", "comedi", "willi", "would", "love"}, sentiment: "+1"}, | |
review{origins: []string{"it'", "frankenstein-monst", "of", "film", "doesn't", "know", "want", "to"}, sentiment: "-1"}, | |
}, | |
}, | |
} | |
func TestExtractOrigins(t *testing.T) { | |
for _, testcase := range extractOriginTests { | |
t.Log(testcase.name) | |
f, err := os.Create(testcase.file) | |
if err != nil { | |
t.Errorf("could not create a file: %s\n %s", testcase.file, err) | |
} | |
f.WriteString(testcase.text) | |
f.Close() | |
f, err = os.Open(testcase.file) | |
if err != nil { | |
t.Errorf("could not open a file: %s\n %s", testcase.file, err) | |
} | |
sc := bufio.NewScanner(f) | |
reviews := reviews{} | |
for sc.Scan() { | |
words := strings.Fields(sc.Text()) | |
review := review{ | |
sentiment: words[0], | |
} | |
for _, word := range words[1:] { | |
if isStopWord(word) { | |
continue | |
} | |
if isMetaCharacter(word) { | |
continue | |
} | |
review.origins = append(review.origins, word) | |
} | |
reviews = append(reviews, review) | |
} | |
result := reviews.getStem() | |
if !reflect.DeepEqual(result, testcase.want) { | |
t.Errorf("want => %#v\n expect => %#v", result, testcase.want) | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment