Last active
March 10, 2016 03:25
-
-
Save sosoyososo/55d6ecbb9d602d4db558 to your computer and use it in GitHub Desktop.
在easou.com使用任意小说的任意一章的url作为参数,获取这本小说的所有文本内容,并按照章节保存为txt (貌似已经因为easou网站的更新不管用了)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"io/ioutil" | |
"net/http" | |
"os" | |
"strings" | |
) | |
// getHtmlContentWithUrl downloads url with an HTTP GET and returns the raw
// response body.
//
// An empty slice is returned when the request cannot be made or the body
// cannot be read completely. The HTTP status code is not inspected, so the
// body of an error page is returned like any other body.
//
// NOTE: http.Get uses http.DefaultClient, which has no timeout; a stalled
// server can block this call indefinitely.
func getHtmlContentWithUrl(url string) []byte {
	resp, err := http.Get(url)
	if err != nil {
		// resp is nil on error, so it must not be touched (or closed) here.
		return []byte{}
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return []byte{}
	}
	return body
}
// isPathExist reports whether os.Stat succeeds for path. Any stat error,
// including "does not exist", yields false.
func isPathExist(path string) bool {
	_, statErr := os.Stat(path)
	return statErr == nil
}
// makeDir creates the single directory dir with permission 0700.
//
// It returns dir when the directory was created by this call, and "" when
// nothing was created: either the path already exists or os.Mkdir failed
// for another reason (for example a missing parent directory).
func makeDir(dir string) string {
	// os.Mkdir itself fails when the path already exists, so no separate
	// existence check is needed and there is no stat-then-create race.
	if err := os.Mkdir(dir, 0700); err != nil {
		return ""
	}
	return dir
}
// makeFileWithBytes writes content to the file "./pages/<url>", creating the
// "./pages/" directory first when it is missing. Despite its name, url is
// used as the file name inside that directory. An existing file is truncated.
//
// The target path is printed to standard output before the file is created.
// Failures (directory creation, file creation, write, close) are reported on
// standard error; the function has no return value, so callers are not
// informed of them.
func makeFileWithBytes(url string, content []byte) {
	relativePath := "./pages/"
	// MkdirAll succeeds when the directory already exists.
	if err := os.MkdirAll(relativePath, 0700); err != nil {
		fmt.Fprintln(os.Stderr, "create output directory:", err)
		return
	}

	relativePath += url
	fmt.Println(relativePath)

	file, err := os.Create(relativePath)
	if err != nil {
		fmt.Fprintln(os.Stderr, "create page file:", err)
		return
	}
	if _, err := file.Write(content); err != nil {
		fmt.Fprintln(os.Stderr, "write page file:", err)
	}
	// Close is checked explicitly: a failed close can mean lost data.
	if err := file.Close(); err != nil {
		fmt.Fprintln(os.Stderr, "close page file:", err)
	}
}
// hasNextPage reports whether the page markup in currentPage contains the
// closing part of a "下章" link, i.e. the text ">下章</a>".
func hasNextPage(currentPage string) bool {
	return strings.Contains(currentPage, ">下章</a>")
}
// getPageContent extracts the chapter title and text from the HTML in
// content.
//
// The title is the text between the marker `class="easou_tit2">` and the
// next "<". The text is everything between the following
// `class="easou_con">` marker and the next "</div>", with every "<br/>"
// replaced by a newline.
//
// Returns (title, text). When the title marker is missing or the title is
// not terminated by a tag, both results are empty. When the title is found
// but the content block is missing or not closed, the title is returned
// with an empty text.
func getPageContent(content string) (string, string) {
	const (
		titleStart   = "class=\"easou_tit2\">"
		contentStart = "class=\"easou_con\">"
		contentEnd   = "</div>"
	)

	index := strings.Index(content, titleStart)
	if index == -1 {
		return "", ""
	}
	subcontent := content[index+len(titleStart):]

	// The title runs up to the start of the next tag.
	index = strings.Index(subcontent, "<")
	if index == -1 {
		return "", ""
	}
	title := subcontent[:index]
	subcontent = subcontent[index:]

	// Each lookup is checked: slicing with an index of -1 would panic or
	// silently produce wrong text.
	index = strings.Index(subcontent, contentStart)
	if index == -1 {
		return title, ""
	}
	subcontent = subcontent[index+len(contentStart):]

	index = strings.Index(subcontent, contentEnd)
	if index == -1 {
		return title, ""
	}
	txt := strings.Replace(subcontent[:index], "<br/>", "\n", -1)
	return title, txt
}
func savePage(content string, page int) { | |
title, txt := getPageContent(content) | |
pageContent := title + "\n\n" + txt | |
makeFileWithBytes(fmt.Sprintf("%d.html", page), []byte(pageContent)) | |
} | |
func getPage(url string, page int) bool { | |
bytes := getHtmlContentWithUrl(url) | |
hasNext := hasNextPage(string(bytes)) | |
savePage(string(bytes), page) | |
// makeFileWithBytes(fmt.Sprintf("%d.html", page), bytes) | |
return hasNext | |
} | |
// pageUrl builds the book.easou.com address for chapter number page of the
// novel identified by nid.
func pageUrl(page int, nid string) string {
	const format = "http://book.easou.com/c/show.m?&nid=%s&st=%d"
	return fmt.Sprintf(format, nid, page)
}
func getContent(nid string) { | |
st := 1 | |
for { | |
url := pageUrl(st, nid) | |
fmt.Println(url) | |
hasNext := getPage(url, st) | |
if !hasNext { | |
break | |
} | |
st++ | |
} | |
} | |
// getNid returns the value of the "nid" query parameter found in url, or ""
// when there is none.
//
// Only an "nid=" that starts the string or directly follows "?" or "&" is
// accepted, so longer parameter names that merely end in "nid" (such as
// "gnid=") are skipped. The value ends at the next "&" or "#", or at the end
// of the string. No percent-decoding is performed.
func getNid(url string) string {
	const key = "nid="
	for offset := 0; offset < len(url); {
		i := strings.Index(url[offset:], key)
		if i == -1 {
			return ""
		}
		start := offset + i
		if start == 0 || url[start-1] == '?' || url[start-1] == '&' {
			value := url[start+len(key):]
			if end := strings.IndexAny(value, "&#"); end != -1 {
				return value[:end]
			}
			return value
		}
		// Matched inside another parameter name; keep searching after it.
		offset = start + len(key)
	}
	return ""
}
func main() { | |
url := os.Args[1] | |
getContent(getNid(url)) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment