Skip to content

Instantly share code, notes, and snippets.

@ericandrewlewis
Created January 24, 2023 03:28
Show Gist options
  • Save ericandrewlewis/2fce39aff70b78b8316cfc87cd10a3eb to your computer and use it in GitHub Desktop.
Save ericandrewlewis/2fce39aff70b78b8316cfc87cd10a3eb to your computer and use it in GitHub Desktop.
package main
import (
"encoding/xml"
"fmt"
"io"
"log"
"os"
"strings"
"time"
)
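// Example <page> element from the dump (the shape that the Page and Revision
// types below are decoded from):
// <page>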
// <title>User talk:Francishooton</title>
// <ns>3</ns>
// <id>62585851</id>
// <revision>
// <id>930802627</id>
// <timestamp>2023-01-15T01:27:12Z</timestamp>
// <contributor>
// <username>MPS1992</username>
// <id>26860051</id>
// </contributor>
// <comment>Welcome to Wikipedia! ([[WP:TW|TW]])</comment>
// <model>wikitext</model>
// <format>text/x-wiki</format>
// <text bytes="1640" id="942429088" />
// <sha1>fb2sdwvu16eeeywgn7yb1zic4tp54a5</sha1>
// </revision>
// </page>
// CustomTime is the field type used for Revision.Timestamp. It embeds
// time.Time and, in the visible source, declares no methods of its own, so
// every method (including the text unmarshalling used by encoding/xml) is the
// one promoted from time.Time.
type CustomTime struct {
time.Time
}
// Revision holds the fields of a <revision> element that this program reads.
// Child elements without a matching field are not stored.
type Revision struct {
// Comment is the content of the <comment> child (the edit summary in the
// sample element above).
Comment string `xml:"comment"`
// Id is the <id> that is a direct child of <revision>.
// NOTE(review): int32 tops out at 2,147,483,647 - confirm this is enough
// for the ids in the dumps being processed.
Id int32 `xml:"id"`
// Text is the character data of the <text> child.
Text string `xml:"text"`
// Timestamp is the <timestamp> child, e.g. 2023-01-15T01:27:12Z in the
// sample element above.
Timestamp CustomTime `xml:"timestamp"`
}
// Page holds the fields of a <page> element that this program reads.
type Page struct {
// Id is the <id> that is a direct child of <page>.
Id int32 `xml:"id"`
// Title is the content of the <title> child, including any namespace
// prefix such as "User talk:".
Title string `xml:"title"`
// Revision collects every <revision> child of the page.
Revision []Revision `xml:"revision"`
// Redirect stays nil when the page has no <redirect> child; the program
// treats a non-nil value as "this page is a redirect" and never reads the
// bool it points to.
Redirect *bool `xml:"redirect,omitempty"`
}
// Reference instants, each captured once when the package is initialised.
var (
	// today is the wall-clock time at program start.
	today = time.Now()

	// threeMonthsAgo lies three calendar months before program start. Pages
	// whose earliest revision is older than this are not reported.
	threeMonthsAgo = time.Now().AddDate(0, -3, 0)
)
// check is a fail-fast helper: it returns silently for a nil error and
// panics with the error value itself otherwise.
func check(e error) {
	if e == nil {
		return
	}
	panic(e)
}
// main streams the XML dump named by the first command-line argument and
// writes one "<title> - <snippet>" line to /tmp/dat2 for every page that
// firstRevisionOfNewPage accepts. Progress lines with the number of pages
// processed and matched are printed to standard output.
//
// Any error (missing argument, unreadable input, malformed XML, failed write
// or close) terminates the program through log.Fatal.
func main() {
	if len(os.Args) < 2 {
		log.Fatal("usage: pass the path of the XML dump as the first argument")
	}
	xmlFile, err := os.Open(os.Args[1])
	if err != nil {
		log.Fatal(err)
	}
	defer xmlFile.Close()

	// The output file is closed explicitly at the end so that a failing
	// Close (which can signal lost writes) is reported rather than ignored.
	outputFile, err := os.Create("/tmp/dat2")
	if err != nil {
		log.Fatal(err)
	}

	d := xml.NewDecoder(xmlFile)
	pagesMatched := 0
	pagesProcessed := 0
	fmt.Printf("%s - Start\n", time.Now().Format(time.UnixDate))
	for {
		tok, err := d.Token()
		if err == io.EOF {
			// EOF means we're done.
			break
		}
		if err != nil {
			// Token returns a nil token together with every error, so the
			// error must be inspected before the token; otherwise a
			// truncated or malformed dump looks like a clean end of input.
			log.Fatalf("Error decoding token: %s", err)
		}
		if tok == nil {
			break
		}

		start, ok := tok.(xml.StartElement)
		if !ok || start.Name.Local != "page" {
			continue
		}
		pagesProcessed++

		// This is a start element named "page": decode the whole element.
		var p Page
		if err := d.DecodeElement(&p, &start); err != nil {
			log.Fatalf("Error decoding item: %s", err)
		}

		first := firstRevisionOfNewPage(&p)
		if first == nil {
			continue
		}

		line := p.Title + " - " + leadingSnippet(first.Text, 20) + "\n"
		if _, err := outputFile.WriteString(line); err != nil {
			log.Fatalf("Error writing output: %s", err)
		}
		// Count the page only once its line has actually been written.
		pagesMatched++
	}

	if err := outputFile.Close(); err != nil {
		log.Fatalf("Error closing output: %s", err)
	}
	fmt.Printf("%s - processed %v, found %v\n", time.Now().Format(time.UnixDate), pagesProcessed, pagesMatched)
}

// firstRevisionOfNewPage returns the earliest revision of p when the page
// should be reported, and nil otherwise. A page is rejected when it carries a
// <redirect> element, when its title starts with "User:" or "User talk:",
// when it has no revisions at all, when its earliest revision is older than
// threeMonthsAgo, or when that revision looks like a redirect.
func firstRevisionOfNewPage(p *Page) *Revision {
	if p.Redirect != nil {
		return nil
	}
	if strings.HasPrefix(p.Title, "User:") || strings.HasPrefix(p.Title, "User talk:") {
		return nil
	}
	if len(p.Revision) == 0 {
		// Nothing to inspect; indexing the slice here would panic.
		return nil
	}

	oldest := 0
	for i := 1; i < len(p.Revision); i++ {
		if p.Revision[i].Timestamp.Time.Before(p.Revision[oldest].Timestamp.Time) {
			oldest = i
		}
	}
	first := &p.Revision[oldest]

	if first.Timestamp.Time.Before(threeMonthsAgo) {
		return nil
	}
	if isRedirectRevision(first) {
		return nil
	}
	return first
}

// isRedirectRevision reports whether r's text begins with the "#redirect"
// keyword (compared case-insensitively, so "#REDIRECT", "#redirect" and
// "#Redirect" all match) or its comment begins with one of the known
// redirect edit summaries.
func isRedirectRevision(r *Revision) bool {
	const keyword = "#redirect"
	if len(r.Text) >= len(keyword) && strings.EqualFold(r.Text[:len(keyword)], keyword) {
		return true
	}

	commentPrefixes := []string{
		"/* top */Adding redirect",
		"Redirecting to",
		"[[WP:AES|←]]Redirected page to ",
	}
	for _, prefix := range commentPrefixes {
		if strings.HasPrefix(r.Comment, prefix) {
			return true
		}
	}
	return false
}

// leadingSnippet returns s unchanged when it is at most limit bytes long.
// Otherwise it returns the longest prefix of s that is no longer than limit
// bytes and ends on a character boundary, so a multi-byte UTF-8 character is
// never cut in half.
func leadingSnippet(s string, limit int) string {
	if len(s) <= limit {
		return s
	}
	cut := 0
	// Ranging over a string yields the byte offset at which each character
	// starts; keep the last such offset that does not exceed limit.
	for i := range s {
		if i > limit {
			break
		}
		cut = i
	}
	return s[:cut]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment