Skip to content

Instantly share code, notes, and snippets.

@deliro
Created March 5, 2021 11:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save deliro/3c09f0b7b0e6c82fd29ddc7d619ce9b6 to your computer and use it in GitHub Desktop.
Save deliro/3c09f0b7b0e6c82fd29ddc7d619ce9b6 to your computer and use it in GitHub Desktop.
Walks the Wikipedia link graph (close to BFS) and finds a path from the main page to a page containing the search term
package main
import (
"crypto/tls"
"fmt"
"io/ioutil"
"log"
"net/http"
"regexp"
"strings"
"sync"
)
// Shared crawler state. NOTE(review): mutable package-level variables
// accessed from many goroutines; see the individual notes below.
var (
// pattern matches whole `<a href="...">` anchor openings; formatURL
// later strips the wrapper and normalizes the URL.
pattern = regexp.MustCompile("<a href=\".*?\">")
// titlePn captures the page <title> element for pretty-path logging.
titlePn = regexp.MustCompile("<title>.*?</title>")
// seen marks URLs that have already been fetched (or claimed for fetching).
seen = &sync.Map{}
// sem caps the number of concurrent HTTP fetches at 30.
sem = make(chan struct{}, 30)
// searchTerm is the case-insensitive target; the crawl stops (log.Fatalf)
// when a fetched page matches it.
searchTerm = regexp.MustCompile("(?i)dota 2")
// requests counts HTTP calls made. NOTE(review): incremented and read
// from multiple goroutines without synchronization — a data race; an
// atomic counter would be the proper fix.
requests = 0
)
// formatURL extracts the href value from a raw `<a href="...">` match
// (as produced by the `pattern` regexp) and normalizes it to an absolute
// ru.wikipedia.org URL.
//
// It returns "" for links that should be skipped: protocol-relative
// links ("//host/...") and links pointing outside ru.wikipedia.org.
func formatURL(x string) string {
	// Strip the anchor-tag wrapper. TrimPrefix/TrimSuffix only touch the
	// ends of the string; the original ReplaceAll would also have removed
	// these substrings if they ever appeared inside the URL itself.
	x = strings.TrimPrefix(x, "<a href=\"")
	x = strings.TrimSuffix(x, "\">")
	// When the anchor carried extra attributes (e.g. `/wiki/Foo" title="bar`),
	// keep only the URL: cut at the first space, then drop the stray quote
	// left over from the attribute boundary.
	if firstSpace := strings.Index(x, " "); firstSpace != -1 {
		x = x[:firstSpace]
	}
	x = strings.ReplaceAll(x, "\"", "")
	switch {
	case strings.HasPrefix(x, "//"):
		// Protocol-relative links (interwiki, external mirrors) are skipped.
		return ""
	case strings.HasPrefix(x, "/"):
		// Site-relative link: make it absolute.
		return "https://ru.wikipedia.org" + x
	case strings.HasPrefix(x, "https://ru.wikipedia.org"):
		return x
	default:
		// Any other host or scheme is outside the crawl.
		return ""
	}
}
// request fetches req.url (unless it was already visited), records the
// cleaned page title in req.Title, terminates the whole program via
// log.Fatalf when the page contains searchTerm, and returns the
// not-yet-seen links discovered on the page.
func request(req *DownReq) []string {
	result := make([]string, 0)
	// LoadOrStore makes the "visited?" check-and-mark atomic. The original
	// separate Load + Store allowed two goroutines to both pass the check
	// and fetch the same page twice.
	if _, loaded := seen.LoadOrStore(req.url, true); loaded {
		return result
	}
	resp, err := http.Get(req.url)
	// NOTE(review): requests is incremented from many goroutines without
	// synchronization — a data race; kept as-is because fixing it requires
	// changing the package-level declaration (it is only a rough counter).
	requests++
	if err != nil {
		log.Println("http error \n", " ", req.url, "\n ", err, "\n...")
		return result
	}
	defer resp.Body.Close()
	// Skip non-HTML responses (images, media files, ...) and error pages.
	if !strings.HasPrefix(resp.Header.Get("Content-Type"), "text/html") {
		return result
	}
	if resp.StatusCode != http.StatusOK {
		return result
	}
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Println("parse err ", err)
		return result
	}
	// Extract and clean the <title> for pretty path printing.
	titleRaw := titlePn.Find(body)
	title := strings.ReplaceAll(string(titleRaw), "<title>", "")
	title = strings.ReplaceAll(title, "</title>", "")
	title = strings.ReplaceAll(title, " — Википедия", "")
	title = strings.ReplaceAll(title, "Википедия — свободная энциклопедия", "*")
	req.Title = title
	log.Println(req.Pretty())
	// Search term found: print the whole path and stop the program.
	if searchTerm.Match(body) {
		fmt.Println(req.Pretty())
		log.Fatalf("FOUND IN %d reqs", requests)
	}
	// Collect every link on the page that has not been visited yet.
	matches := pattern.FindAll(body, -1)
	for _, match := range matches {
		cleanURL := formatURL(string(match))
		if cleanURL == "" {
			continue
		}
		if _, ok := seen.Load(cleanURL); !ok {
			result = append(result, cleanURL)
		}
	}
	return result
}
// DownReq is a single crawl item: a URL plus a back-link to the page it
// was discovered on, which together form a path back to the start page.
type DownReq struct {
	parent *DownReq // page this link was found on; nil for the start page
	url    string   // absolute URL to fetch
	Title  string   // cleaned page title, filled in after the fetch
}

// Pretty renders the chain of page titles from the start page down to
// this request, joined with " > ".
func (dr *DownReq) Pretty() string {
	titles := make([]string, 0)
	for node := dr; node != nil; node = node.parent {
		titles = append(titles, node.Title)
	}
	// Titles were collected leaf-to-root; reverse them in place.
	for i, j := 0, len(titles)-1; i < j; i, j = i+1, j-1 {
		titles[i], titles[j] = titles[j], titles[i]
	}
	return strings.Join(titles, " > ")
}
// worker consumes crawl requests from q, fetching each page in its own
// goroutine (bounded to 30 in flight by sem) and feeding newly
// discovered links back into q.
//
// NOTE(review): q is never closed, so `range q` never terminates and the
// trailing wg.Wait() is effectively unreachable — the program only exits
// via log.Fatalf inside request(). Also, `requests` is read here while
// other goroutines increment it (a data race), and the re-queueing
// goroutine can block forever on `q <-` if the channel buffer fills.
func worker(q chan *DownReq) {
wg := &sync.WaitGroup{}
for req := range q {
// Acquire a semaphore slot before spawning: caps concurrent fetches.
sem <- struct{}{}
wg.Add(1)
// Coarse progress logging (counter is racy; see note above).
if (requests % 1000) == 0 {
log.Println(requests, " requests made")
}
// req is passed as an argument so each goroutine gets its own copy
// (the loop variable is reused across iterations).
go func(req *DownReq) {
newUrls := request(req)
// Re-queue discovered links from a separate goroutine so the
// fetcher can release its semaphore slot without waiting on q.
go func() {
for _, url := range newUrls {
_, ok := seen.Load(url)
if !ok {
newReq := &DownReq{parent: req, url: url, Title: ""}
q <- newReq
}
}
}()
<-sem
wg.Done()
}(req)
}
wg.Wait()
}
// main seeds the crawl with the Russian Wikipedia main page and runs the
// worker loop; the program terminates via log.Fatalf in request() once
// the search term is found.
func main() {
	// SECURITY(review): disabling certificate verification exposes the
	// crawler to man-in-the-middle attacks; acceptable only for a
	// throwaway experiment like this one.
	http.DefaultTransport.(*http.Transport).TLSClientConfig = &tls.Config{InsecureSkipVerify: true}
	// http.Get uses DefaultClient, which has no timeout by default, so a
	// single stuck server would pin one of the 30 fetch slots forever.
	// 30s expressed in nanoseconds via an untyped constant, to avoid
	// importing "time" just for this one assignment.
	http.DefaultClient.Timeout = 30 * 1_000_000_000
	// Large buffer so the link re-queueing goroutines rarely block.
	download := make(chan *DownReq, 1_000_000)
	startFrom := "https://ru.wikipedia.org/wiki/Заглавная_страница"
	download <- &DownReq{parent: nil, url: startFrom, Title: ""}
	worker(download)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment