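// Navi's Wikipedia Bot
//
// The lines "Instantly share code, notes, and snippets.", "Embed" and
// "What would you like to do?" that preceded this title are navigation text
// from the page this file was copied from; they are not part of the program.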
package main
import (
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
	"os"
	"strconv"
	"strings"
	"time"

	"gopkg.in/mgo.v2"
	"gopkg.in/mgo.v2/bson"

	"euler-data-processor/crawlerstruct"
	"euler-data-processor/wikidatastruct"
)
// wikiHTTPClient is shared by every getJson call so connections can be
// reused. The timeout keeps a stalled connection from blocking the caller
// indefinitely.
var wikiHTTPClient = &http.Client{Timeout: 60 * time.Second}

// getJson issues a GET request for url, identifying the bot through the
// User-Agent request header, and decodes the JSON response body into target.
//
// target must be a pointer that encoding/json can decode into. An error is
// returned when the request cannot be built or sent, when the server answers
// with a non-2xx status, or when the body is not valid JSON for target.
func getJson(url string, target interface{}) error {
	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return err
	}
	// The header has to be set on the outgoing request; a response has no
	// way to carry it back to the server.
	req.Header.Set("User-Agent", `naviWikiBot/1.0 (http://usenavi.com/; tech@usenavi.com)`)

	resp, err := wikiHTTPClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// Report the status explicitly instead of letting a non-JSON error page
	// surface as a confusing decode error.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return fmt.Errorf("unexpected HTTP status %s", resp.Status)
	}
	return json.NewDecoder(resp.Body).Decode(target)
}
// Settings for the crawl loop.
const (
	// wikiBatchSize is the number of items read from MongoDB per round; their
	// titles are sent to the Wikipedia API in a single query.
	wikiBatchSize = 50
	// wikiMaxAttempts bounds consecutive failed API requests for one batch so
	// a persistent failure cannot loop forever.
	wikiMaxAttempts = 5
	// wikiRetryDelay is the pause between failed API requests.
	wikiRetryDelay = 2 * time.Second

	wikiQueryBase  = "https://en.wikipedia.org/w/api.php?action=query&format=json&formatversion=2"
	wikiQueryProps = "&prop=revisions|links|extlinks&rvprop=content&pllimit=max&ellimit=max"
)

// main runs the crawl and exits with a non-zero status if it had to stop early.
func main() {
	if err := runWikiCrawl(); err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
}

// runWikiCrawl walks the "items" collection of the "WikiData" database in
// _id order, wikiBatchSize items at a time. For every batch it fetches the
// English Wikipedia markup, links and external links of the items' enwiki
// titles and writes them to the "items" and "entities" collections.
//
// It returns an error when MongoDB cannot be reached or a batch query fails.
func runWikiCrawl() error {
	session, err := mgo.Dial("mongodb://localhost:27017")
	if err != nil {
		return fmt.Errorf("MongoDB connection error: %v", err)
	}
	defer session.Close()

	itemsCollection := session.DB("WikiData").C("items")
	entitiesCollection := session.DB("WikiData").C("entities")

	items, err := loadItemBatch(itemsCollection, bson.M{})
	if err != nil {
		return fmt.Errorf("Item collection query error: %v", err)
	}
	for len(items) > 0 {
		// Map each distinct enwiki title to the first item carrying it, so
		// returned pages can be matched without rescanning the batch.
		indexByTitle := make(map[string]int, len(items))
		titles := make([]string, 0, len(items))
		for itemKey := range items {
			title, ok := enwikiTitle(items[itemKey].Sitelinks)
			if !ok {
				continue
			}
			if _, seen := indexByTitle[title]; seen {
				continue
			}
			indexByTitle[title] = itemKey
			titles = append(titles, title)
		}

		// With no titles there is nothing to fetch; the items are still
		// written so their wiki fields are set.
		if len(titles) == 0 || fetchWikiBatch(items, indexByTitle, titles) {
			for itemKey := range items {
				storeWikiData(itemsCollection, entitiesCollection, &items[itemKey])
			}
		} else {
			// Writing a partially fetched batch would overwrite stored markup
			// with incomplete data, so the whole batch is left untouched.
			fmt.Println("Skipping batch after repeated Wikipedia API failures, first item:", items[0].Id)
		}

		lastID := items[len(items)-1].Id
		items, err = loadItemBatch(itemsCollection, bson.M{"_id": bson.M{"$gt": lastID}})
		if err != nil {
			return fmt.Errorf("Item collection query error: %v", err)
		}
	}
	return nil
}

// loadItemBatch returns up to wikiBatchSize items matching filter, ordered by
// _id, with only the sitelinks.enwiki.title field selected.
func loadItemBatch(c *mgo.Collection, filter bson.M) ([]wikidatastruct.Entity, error) {
	var batch []wikidatastruct.Entity
	err := c.Find(filter).Limit(wikiBatchSize).Sort("_id").Select(bson.M{"sitelinks.enwiki.title": 1}).All(&batch)
	return batch, err
}

// enwikiTitle extracts sitelinks["enwiki"]["title"] from a decoded sitelinks
// value. It reports false instead of panicking when any level is missing or
// has an unexpected type, or when the title is empty.
func enwikiTitle(sitelinks interface{}) (string, bool) {
	links, ok := sitelinks.(bson.M)
	if !ok {
		return "", false
	}
	enwiki, ok := links["enwiki"].(bson.M)
	if !ok {
		return "", false
	}
	title, ok := enwiki["title"].(string)
	return title, ok && title != ""
}

// fetchWikiBatch queries the Wikipedia API for titles, following continuation
// parameters until the response reports batchcomplete, and merges every
// returned page into the item that indexByTitle maps its title to.
//
// It returns true once the batch is complete and false when it gives up:
// after wikiMaxAttempts consecutive failed requests, or when a response is
// neither complete nor continuable.
func fetchWikiBatch(items []wikidatastruct.Entity, indexByTitle map[string]int, titles []string) bool {
	jsonURL := wikiQueryBase + "&titles=" + url.QueryEscape(strings.Join(titles, "|")) + wikiQueryProps
	continuedJsonURL := jsonURL

	for failures := 0; failures < wikiMaxAttempts; {
		apiData := crawlerstruct.Wiki{}
		err := getJson(continuedJsonURL, &apiData)
		if err != nil || len(apiData.Error) > 0 {
			if err != nil {
				fmt.Println("Wikipedia API Error:", err)
			}
			for errorKey, errorValue := range apiData.Error {
				fmt.Println("ERROR:", errorKey, ":", errorValue)
			}
			fmt.Println("JsonURL:", continuedJsonURL)
			// Retry the same URL (not the base URL) so pages that were
			// already merged are not requested and appended a second time.
			failures++
			if failures < wikiMaxAttempts {
				time.Sleep(wikiRetryDelay)
			}
			continue
		}
		failures = 0

		if apiData.Warnings["main"]["warnings"] != "" {
			fmt.Println("MAIN WARNNING:", apiData.Warnings["main"]["warnings"])
			fmt.Println("JsonURL:", continuedJsonURL)
		}
		if apiData.Warnings["query"]["warnings"] != "" {
			fmt.Println("QUERY WARNNING:", apiData.Warnings["query"]["warnings"])
			fmt.Println("JsonURL:", continuedJsonURL)
		}

		for _, page := range apiData.Query.Pages {
			itemKey, ok := indexByTitle[page.Title]
			if !ok {
				continue
			}
			item := &items[itemKey]
			if len(page.Revisions) > 0 && page.Revisions[0].Content != "" {
				item.Wiki_en_markup += page.Revisions[0].Content
				// Length is counted in runes, not bytes.
				item.Wiki_en_markup_length += float64(len([]rune(page.Revisions[0].Content)))
			}
			for _, link := range page.Links {
				item.Wikipedia_edges = append(item.Wikipedia_edges, wikidatastruct.Edge{Subject: page.Title, Object: link.Title})
			}
			for _, link := range page.Extlinks {
				item.Wikipedia_external_links = append(item.Wikipedia_external_links, wikidatastruct.Wikipedia_external_link{Url: link.Url})
			}
		}

		if apiData.Batchcomplete {
			return true
		}
		if len(apiData.Continue) == 0 {
			// Requesting the same URL again would only merge the same pages
			// a second time.
			fmt.Println("Wikipedia API response is neither complete nor continuable")
			fmt.Println("JsonURL:", continuedJsonURL)
			return false
		}

		// Continuation parameters are appended to the base query, escaped so
		// characters such as "|", "&" or spaces survive the round trip.
		continuedJsonURL = jsonURL
		for continueKey, continueValue := range apiData.Continue {
			var text string
			switch v := continueValue.(type) {
			case string:
				text = v
			case float64:
				text = strconv.FormatFloat(v, 'f', -1, 64)
			default:
				text = fmt.Sprint(v)
			}
			continuedJsonURL += "&" + url.QueryEscape(continueKey) + "=" + url.QueryEscape(text)
		}
	}
	return false
}

// storeWikiData writes item's collected markup, markup length, edges and
// external links to the items collection (by item.Id) and to the entities
// collection (by "E"+item.Id). Update errors are printed and do not stop
// the crawl.
func storeWikiData(itemsCollection, entitiesCollection *mgo.Collection, item *wikidatastruct.Entity) {
	updateData := bson.M{
		"$set": bson.M{
			"wiki_en_markup":        item.Wiki_en_markup,
			"wiki_en_markup_length": item.Wiki_en_markup_length,
		},
		"$addToSet": bson.M{
			"wikipedia_edges":          bson.M{"$each": item.Wikipedia_edges},
			"wikipedia_external_links": bson.M{"$each": item.Wikipedia_external_links},
		},
	}
	if err := itemsCollection.UpdateId(item.Id, updateData); err != nil {
		fmt.Println(item.Id, "- Item collection update error:", err)
	}
	if err := entitiesCollection.UpdateId("E"+item.Id, updateData); err != nil {
		fmt.Println(item.Id, "- Entity collection update error:", err)
	}
	fmt.Println("Updated item:", item.Id)
}
// (Page footer captured with the code, not part of the program:
// "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")