Skip to content

Instantly share code, notes, and snippets.

@sisteamnik
Created November 25, 2014 21:06
Show Gist options
  • Save sisteamnik/6945f8bc7172f8f30ac5 to your computer and use it in GitHub Desktop.
Save sisteamnik/6945f8bc7172f8f30ac5 to your computer and use it in GitHub Desktop.
Greedy algorithm for removing excess whitespace from text
package main
import (
"github.com/sisteamnik/GoOse"
"io/ioutil"
"regexp"
"strings"
)
// u is the URL of the article that main passes to the extractor.
var u = "http://www.itnews.com/drives/86430/want-100tb-disk-drive-youll-have-wait-til-2025"
// main extracts the article at u, prints its cleaned text, normalizes the
// whitespace of that text with clean and writes the result to out.txt in the
// current directory.
func main() {
	g := goose.New()
	article := g.ExtractFromUrl(u)
	println("content", article.CleanedText)
	str := clean(article.CleanedText)
	// The output is plain text, so it is created as a regular rw-r--r-- file
	// rather than a world-writable executable one. A failed write is reported
	// instead of being silently dropped; panic is used so that no additional
	// import is required and the process exits with a non-zero status.
	if err := ioutil.WriteFile("out.txt", []byte(str), 0644); err != nil {
		panic(err)
	}
}
// horizontalSpaceRe matches a run of whitespace characters other than the
// newline. It is compiled once at package scope instead of on every call.
var horizontalSpaceRe = regexp.MustCompile(`[ \r\f\v\t]+`)

// clean normalizes the whitespace in str and returns the result.
//
// Every run of spaces, tabs, carriage returns, form feeds and vertical tabs
// is replaced by a single space, and each line is trimmed. Blank lines that
// precede the first non-blank line are dropped, and every later run of blank
// lines is collapsed to exactly one blank line, so paragraphs end up
// separated by "\n\n" (usable as a markdown paragraph break). A trailing
// newline after content is preserved as a single "\n".
func clean(str string) string {
	str = horizontalSpaceRe.ReplaceAllString(str, " ")
	lines := strings.Split(str, "\n")
	out := make([]string, 0, len(lines))
	for _, line := range lines {
		line = strings.TrimSpace(line)
		if line != "" {
			out = append(out, line)
			continue
		}
		// Keep a blank line only when it directly follows emitted content.
		// Checking the last emitted line (rather than indexing backwards into
		// the input) cannot go out of range and guarantees the output never
		// contains two consecutive blank lines.
		if n := len(out); n > 0 && out[n-1] != "" {
			out = append(out, "")
		}
	}
	return strings.Join(out, "\n")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment