-
-
Save hongruiqi/6560317 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/csv" | |
"fmt" | |
"io" | |
"io/ioutil" | |
"log" | |
"net/http" | |
"net/http/cookiejar" | |
"net/url" | |
"os" | |
"regexp" | |
"strings" | |
"sync" | |
"time" | |
) | |
func processData(input string, output string) (err error) { | |
inputFile, err := os.Open(input) | |
outputFile, err := os.Create(output) | |
if err != nil { | |
return | |
} | |
defer inputFile.Close() | |
defer outputFile.Close() | |
reader := csv.NewReader(inputFile) | |
writer := csv.NewWriter(outputFile) | |
inChan := make([]string, 10) | |
outChan := make([]string, 10) | |
var wg sync.WaitGroup | |
for i := 0; i < 10; i++ { | |
go func() { | |
for record := range inChan { | |
count, err := searchCount(record[0], record[1], strings.Replace(record[2], " ", "", -1)) | |
if err != nil { | |
log.Println(err) | |
} | |
record = append(record, count) | |
outChan <- record | |
wg.Done() | |
} | |
}() | |
} | |
outputDone := make(chan bool) | |
go func() { | |
for record := range outChan { | |
err = writer.Write(record) | |
if err != nil { | |
log.Println(err) | |
} | |
} | |
outputDone <- true | |
}() | |
for record, err := reader.Read(); err != io.EOF; record, err = reader.Read() { | |
if err != nil { | |
return err | |
} | |
wg.Add(1) | |
inChan <- record | |
fmt.Println(record) | |
} | |
close(inChan) | |
wg.Wait() | |
close(outChan) | |
<-outputDone | |
writer.Flush() | |
return nil | |
} | |
// resultCountPattern extracts the number of hits from the result page.
// It is compiled once here instead of on every searchCount call.
var resultCountPattern = regexp.MustCompile(" 找到 (\\d+) 条结果 ")

// searchTimeout bounds each HTTP request made by searchCount so that a
// stalled server cannot block a caller forever.
const searchTimeout = 30 * time.Second

// searchCount asks the search service at epub.cnki.net how many results it
// finds for the given stock code or company name within the given year, and
// returns that number as the decimal string captured from the result page.
//
// The query is restricted to the CCND database and to the four newspapers
// listed in magazine_value1, with a publish date between year-01-01 and
// year-12-31. stock and name are sent as the two values of the "FT" text
// field joined by the "#CNKI_OR" relation.
//
// Two requests are made on one client with a cookie jar: the first registers
// the search, and its response body is appended verbatim to the URL of the
// second, which returns the result page.
// NOTE(review): presumably the server ties the registered search to the
// session cookie, which is why both requests share a jar — confirm.
//
// An error is returned if a URL cannot be built, a request fails or answers
// with a status other than 200 OK, a body cannot be read, or the result page
// does not contain the expected count text. On error count is empty.
func searchCount(stock string, year string, name string) (count string, err error) {
	u, err := url.Parse("http://epub.cnki.net")
	if err != nil {
		return "", err
	}
	u.Path += "/KNS/request/SearchHandler.ashx"

	v := url.Values{}
	v.Add("action", "")
	v.Add("NaviCode", "*")
	v.Add("ua", "1.21")
	v.Add("PageName", "ASP.brief_result_aspx")
	v.Add("DbPrefix", "CCND")
	v.Add("DbCatalog", "中国重要报纸全文数据库")
	v.Add("ConfigFile", "CCND.xml")
	v.Add("db_opt", "中国重要报纸全文数据库")
	v.Add("db_value", "中国重要报纸全文数据库")
	v.Add("magazine_value1", "中国证券报+上海证券报+证券时报+证券日报")
	v.Add("magazine_special1", "=")
	v.Add("publishdate_from", year+"-01-01")
	v.Add("publishdate_to", year+"-12-31")
	v.Add("au_1_sel", "AU")
	v.Add("au_1_special1", "=")
	v.Add("txt_1_sel", "FT")
	v.Add("txt_1_value1", stock)
	v.Add("txt_1_value2", name)
	v.Add("txt_1_relation", "#CNKI_OR")
	v.Add("txt_1_special1", "%")
	v.Add("his", "0")
	// The "__" parameter carries the current time in a fixed textual layout.
	// NOTE(review): looks like a cache-buster mimicking a browser timestamp — confirm.
	const layout = "Mon Jan 02 2006 15:04:05 GMT+0800 (CST)"
	v.Add("__", time.Now().Format(layout))
	u.RawQuery = v.Encode()

	jar, err := cookiejar.New(nil)
	if err != nil {
		return "", err
	}
	// Keyed fields: the positional form breaks whenever http.Client gains a field.
	c := &http.Client{Jar: jar, Timeout: searchTimeout}

	handle, err := fetchPage(c, u.String())
	if err != nil {
		return "", err
	}

	page, err := fetchPage(c, "http://epub.cnki.net/kns/brief/brief.aspx?pagename="+string(handle))
	if err != nil {
		return "", err
	}

	// FindSubmatch returns nil when the page has no count text; indexing the
	// result unconditionally would panic.
	m := resultCountPattern.FindSubmatch(page)
	if m == nil {
		return "", fmt.Errorf("result count not found in response for stock %q, year %q, name %q", stock, year, name)
	}
	return string(m[1]), nil
}

// fetchPage issues a GET for rawURL on c and returns the whole response body.
// The body is always closed, and any status other than 200 OK is an error.
func fetchPage(c *http.Client, rawURL string) ([]byte, error) {
	resp, err := c.Get(rawURL)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("GET %s: unexpected status %s", rawURL, resp.Status)
	}
	return ioutil.ReadAll(resp.Body)
}
func main() { | |
err := processData("testdata.csv", "output.csv") | |
if err != nil { | |
panic(err) | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
000012 | 2010 | 南 玻A | 71237587 | 59649365 | 74 | |||
---|---|---|---|---|---|---|---|---|
000012 | 2011 | 南 玻A | 97496836 | 84435273 | 88 | |||
000014 | 2010 | 沙河股份 | 12821932.22 | 12821932.22 | 14 | |||
000014 | 2011 | 沙河股份 | 9785185.39 | 9785185.39 | 10 | |||
000020 | 2010 | 深华发A | 1939765 | 1201976.99 | 8 | |||
000020 | 2011 | 深华发A | 5926290.77 | 4925343.09 | 2 | |||
000020 | 2012 | 深华发A | 6449675.03 | 5142319.36 | 17 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment