Skip to content

Instantly share code, notes, and snippets.

@liuzhe0223
Last active December 23, 2015 08:09
Show Gist options
  • Save liuzhe0223/6605957 to your computer and use it in GitHub Desktop.
Save liuzhe0223/6605957 to your computer and use it in GitHub Desktop.
获取 http://www.dmozdir.org 上分类好的域名
/*
author: liuzhe
purpose: get classified domains from www.dmozdir.org
*/
package main
import (
"net/http"
"fmt"
"io/ioutil"
"regexp"
"labix.org/v2/mgo"
)
// urlTem is the format string for a category listing page on www.dmozdir.org.
// fetch fills the two %d verbs with a FetchTarget's smallPath and pageList,
// in that order.
const urlTem = "http://www.dmozdir.org/Category/?SmallPath=%d&PAGEList=%d"
// FetchTarget identifies a single listing page to download.
//
// The field order is significant: main builds values with positional
// composite literals.
type FetchTarget struct {
	// smallPath and pageList are substituted, in this order, into the two
	// %d verbs of urlTem.
	smallPath, pageList int8

	// category is the label copied onto every FetchRes produced for this page.
	category string
}
// FetchRes is the message type sent from fetch to main over a channel.
//
// A message either carries one extracted address, reports a failure
// (status == "err"), or marks the end of a page's results (isEnd == true).
// main passes the whole value to the MongoDB collection's Insert; only
// Category and Address are exported.
type FetchRes struct {
	// status is set to "err" when the page could not be downloaded or read;
	// it is left empty otherwise.
	status string

	// Category is the label taken from the originating FetchTarget, and
	// Address is the text found between <address> and </address>.
	Category, Address string

	// isEnd marks the final message sent for a page.
	isEnd bool
}
// CateData describes one category to crawl: its label, its SmallPath id and
// the inclusive range of listing pages to request.
//
// The field order is significant: main builds values with positional
// composite literals.
type CateData struct {
	// category is the label attached to every address found in this category.
	category string

	// smallPath is the category id used in the page URL; pageStart and
	// pageStop are the first and last page numbers requested (both inclusive).
	smallPath, pageStart, pageStop int8
}
// addressRe matches one <address>...</address> element whose content holds no
// whitespace. It is compiled once at package initialisation instead of on
// every call to fetch.
var addressRe = regexp.MustCompile("<address>[^\\s]*</address>")

// fetch downloads the listing page described by fetchTarget and sends one
// FetchRes per extracted address on ch.
//
// Every call finishes by sending exactly one message with isEnd == true, so a
// receiver that loops until it sees isEnd always terminates:
//   - on success the terminator has an empty status;
//   - if the request fails, the server answers with a non-200 status, or the
//     body cannot be read, the terminator has status "err" and no address
//     messages precede it.
//
// Sends on ch block until the receiver is ready, so fetch is meant to be run
// in its own goroutine.
func fetch(fetchTarget FetchTarget, ch chan FetchRes) {
	const (
		openTag  = "<address>"
		closeTag = "</address>"
	)

	fetchRes := FetchRes{Category: fetchTarget.category}

	// fail reports an error to the receiver. The message is also flagged as
	// the end of the stream; otherwise the receiver would wait forever for a
	// terminator that never arrives.
	fail := func() {
		fetchRes.status = "err"
		fetchRes.isEnd = true
		ch <- fetchRes
	}

	url := fmt.Sprintf(urlTem, fetchTarget.smallPath, fetchTarget.pageList)
	resp, err := http.Get(url)
	if err != nil {
		fail()
		return
	}
	defer resp.Body.Close()

	// An error page is not a listing; report it instead of scanning it.
	if resp.StatusCode != http.StatusOK {
		fail()
		return
	}

	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fail()
		return
	}

	for _, match := range addressRe.FindAllString(string(content), -1) {
		// Every match starts with openTag and ends with closeTag, so
		// slicing by their lengths leaves just the address text.
		fetchRes.Address = match[len(openTag) : len(match)-len(closeTag)]
		ch <- fetchRes
	}

	fetchRes.isEnd = true
	ch <- fetchRes
}
func main() {
cateDataList := []CateData{
CateData{"招聘", 84, 1, 36},
CateData{"电子商务", 110, 1, 61}
}
chList := []chan FetchRes{}
for _, cateData := range cateDataList {
for pageNu := cateData.pageStart; pageNu <= cateData.pageStop; pageNu++ {
fetchTarget := FetchTarget{cateData.smallPath, pageNu, cateData.category}
ch := make(chan FetchRes)
chList = append(chList, ch)
go fetch(fetchTarget, ch)
}
}
session, _ := mgo.Dial("localhost")
c := session.DB("corpus").C("dmoz_address")
for _, ch := range chList {
for {
fetchRes := <- ch
if fetchRes.isEnd {
break
}
c.Insert(fetchRes)
fmt.Println(fetchRes.Address)
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment