Skip to content

Instantly share code, notes, and snippets.

@hongruiqi
Forked from shanehou/cnki-search.go
Last active December 23, 2015 01:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hongruiqi/6560317 to your computer and use it in GitHub Desktop.
Save hongruiqi/6560317 to your computer and use it in GitHub Desktop.
package main
import (
"encoding/csv"
"fmt"
"io"
"io/ioutil"
"log"
"net/http"
"net/http/cookiejar"
"net/url"
"os"
"regexp"
"strings"
"sync"
"time"
)
// processData reads CSV records from the file named input, looks up a search
// count for every record, and writes each record with that count appended as
// an extra trailing field to the CSV file named output.
//
// Every record must contain at least three fields; they are passed to
// searchCount as (stock, year, name), with all spaces removed from name.
// Lookups run on a fixed pool of worker goroutines, so rows in the output
// are not guaranteed to appear in input order.
//
// A failed lookup is logged and the record is still written, with whatever
// count searchCount returned alongside the error. Errors opening, reading,
// writing, flushing or closing the files are returned to the caller.
func processData(input string, output string) (err error) {
	inputFile, err := os.Open(input)
	if err != nil {
		return err
	}
	defer inputFile.Close()

	outputFile, err := os.Create(output)
	if err != nil {
		return err
	}
	defer func() {
		// A failed Close on a file we wrote can mean lost data, so report it
		// unless an earlier error is already being returned.
		if cerr := outputFile.Close(); err == nil {
			err = cerr
		}
	}()

	reader := csv.NewReader(inputFile)
	writer := csv.NewWriter(outputFile)

	const workers = 10
	inChan := make(chan []string, workers)
	outChan := make(chan []string, workers)

	// Workers: one searchCount call per record, result appended to the record.
	var wg sync.WaitGroup
	wg.Add(workers)
	for i := 0; i < workers; i++ {
		go func() {
			defer wg.Done()
			for record := range inChan {
				count, serr := searchCount(record[0], record[1], strings.Replace(record[2], " ", "", -1))
				if serr != nil {
					log.Println(serr)
				}
				outChan <- append(record, count)
			}
		}()
	}

	// Single writer: csv.Writer is not safe for concurrent use. It keeps
	// draining outChan after a write error so the workers never block.
	writeDone := make(chan error, 1)
	go func() {
		var firstErr error
		for record := range outChan {
			if werr := writer.Write(record); werr != nil && firstErr == nil {
				firstErr = werr
			}
		}
		writeDone <- firstErr
	}()

	// Feed the workers. On failure, stop reading but still fall through to the
	// shutdown sequence below so that no goroutine is leaked.
	var readErr error
	for {
		record, rerr := reader.Read()
		if rerr == io.EOF {
			break
		}
		if rerr != nil {
			readErr = rerr
			break
		}
		if len(record) < 3 {
			readErr = fmt.Errorf("%s: record %q has %d fields, want at least 3", input, record, len(record))
			break
		}
		inChan <- record
		fmt.Println(record)
	}

	close(inChan)
	wg.Wait()
	close(outChan)
	writeErr := <-writeDone
	writer.Flush()

	if readErr != nil {
		return readErr
	}
	if writeErr != nil {
		return writeErr
	}
	return writer.Error()
}
// resultCountRe extracts the hit count from the result page markup
// ("找到 N 条结果", i.e. "found N results", with &nbsp; separators).
// It is compiled once at package scope rather than on every lookup.
var resultCountRe = regexp.MustCompile("&nbsp;找到&nbsp;(\\d+)&nbsp;条结果&nbsp;")

// searchCount asks epub.cnki.net how many results the CCND database
// (中国重要报纸全文数据库) holds for the given stock code or company name,
// restricted to the four securities newspapers listed in magazine_value1 and
// to publication dates from year-01-01 through year-12-31.
//
// The lookup takes two requests sharing one cookie jar: the first submits the
// search to SearchHandler.ashx, and the body it returns is appended to the
// brief.aspx URL as the pagename query of the second request, whose HTML is
// scanned for the result count.
//
// It returns the count as the decimal string found in the page. A non-nil
// error is returned when a request fails, a body cannot be read, or the
// result page does not contain a recognizable count; count is then empty.
func searchCount(stock string, year string, name string) (count string, err error) {
	u, err := url.Parse("http://epub.cnki.net")
	if err != nil {
		return "", err
	}
	u.Path += "/KNS/request/SearchHandler.ashx"

	v := url.Values{}
	v.Add("action", "")
	v.Add("NaviCode", "*")
	v.Add("ua", "1.21")
	v.Add("PageName", "ASP.brief_result_aspx")
	v.Add("DbPrefix", "CCND")
	v.Add("DbCatalog", "中国重要报纸全文数据库")
	v.Add("ConfigFile", "CCND.xml")
	v.Add("db_opt", "中国重要报纸全文数据库")
	v.Add("db_value", "中国重要报纸全文数据库")
	v.Add("magazine_value1", "中国证券报+上海证券报+证券时报+证券日报")
	v.Add("magazine_special1", "=")
	v.Add("publishdate_from", year+"-01-01")
	v.Add("publishdate_to", year+"-12-31")
	v.Add("au_1_sel", "AU")
	v.Add("au_1_special1", "=")
	v.Add("txt_1_sel", "FT")
	v.Add("txt_1_value1", stock)
	v.Add("txt_1_value2", name)
	v.Add("txt_1_relation", "#CNKI_OR")
	v.Add("txt_1_special1", "%")
	v.Add("his", "0")
	// NOTE(review): the "__" parameter looks like a browser-style timestamp
	// (possibly a cache buster); the zone suffix is a fixed literal rather
	// than the local zone. Confirm against the site if it ever matters.
	const layout = "Mon Jan 02 2006 15:04:05 GMT+0800 (CST)"
	v.Add("__", time.Now().Format(layout))
	u.RawQuery = v.Encode()

	jar, err := cookiejar.New(nil)
	if err != nil {
		return "", err
	}
	// Keyed fields keep this literal valid as http.Client grows new fields.
	// The timeout stops a stalled server from blocking the caller forever.
	c := &http.Client{Jar: jar, Timeout: 30 * time.Second}

	b, err := httpGetBody(c, u.String())
	if err != nil {
		return "", err
	}
	b, err = httpGetBody(c, "http://epub.cnki.net/kns/brief/brief.aspx?pagename="+string(b))
	if err != nil {
		return "", err
	}

	m := resultCountRe.FindSubmatch(b)
	if m == nil {
		return "", fmt.Errorf("result count not found in response for stock %q, year %q, name %q", stock, year, name)
	}
	return string(m[1]), nil
}

// httpGetBody issues a GET for rawURL with c and returns the whole response
// body, closing the body on every path.
func httpGetBody(c *http.Client, rawURL string) ([]byte, error) {
	resp, err := c.Get(rawURL)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	return ioutil.ReadAll(resp.Body)
}
// main runs processData on testdata.csv, writing the results to output.csv,
// and panics with the returned error if processing fails.
func main() {
	if failure := processData("testdata.csv", "output.csv"); failure != nil {
		panic(failure)
	}
}
000012 2010 南 玻A 71237587 59649365 74
000012 2011 南 玻A 97496836 84435273 88
000014 2010 沙河股份 12821932.22 12821932.22 14
000014 2011 沙河股份 9785185.39 9785185.39 10
000020 2010 深华发A 1939765 1201976.99 8
000020 2011 深华发A 5926290.77 4925343.09 2
000020 2012 深华发A 6449675.03 5142319.36 17
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment