-
-
Save ericandrewlewis/2fce39aff70b78b8316cfc87cd10a3eb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/xml" | |
"fmt" | |
"io" | |
"log" | |
"os" | |
"strings" | |
"time" | |
) | |
// Example <page> element:
//
//	<page>
//	  <title>User talk:Francishooton</title>
//	  <ns>3</ns>
//	  <id>62585851</id>
//	  <revision>
//	    <id>930802627</id>
//	    <timestamp>2023-01-15T01:27:12Z</timestamp>
//	    <contributor>
//	      <username>MPS1992</username>
//	      <id>26860051</id>
//	    </contributor>
//	    <comment>Welcome to Wikipedia! ([[WP:TW|TW]])</comment>
//	    <model>wikitext</model>
//	    <format>text/x-wiki</format>
//	    <text bytes="1640" id="942429088" />
//	    <sha1>fb2sdwvu16eeeywgn7yb1zic4tp54a5</sha1>
//	  </revision>
//	</page>

// CustomTime is a named wrapper around time.Time used for timestamp fields.
// Because time.Time is embedded, all of its methods are promoted, including
// UnmarshalText, which parses RFC 3339 strings such as "2023-01-15T01:27:12Z".
type CustomTime struct {
	time.Time
}
type Revision struct { | |
Comment string `xml:"comment"` | |
Id int32 `xml:"id"` | |
Text string `xml:"text"` | |
Timestamp CustomTime `xml:"timestamp"` | |
} | |
type Page struct { | |
Id int32 `xml:"id"` | |
Title string `xml:"title"` | |
Revision []Revision `xml:"revision"` | |
Redirect *bool `xml:"redirect,omitempty"` | |
} | |
var (
	// today is the wall-clock time captured once at program start.
	today = time.Now()
	// threeMonthsAgo is the cutoff three calendar months before today. It is
	// derived from today (rather than a second time.Now call) so that both
	// values refer to the same instant.
	threeMonthsAgo = today.AddDate(0, -3, 0)
)
// check panics with e when e is non-nil and does nothing otherwise.
func check(e error) {
	if e != nil {
		panic(e)
	}
}
func main() { | |
xmlFile, err := os.Open(os.Args[1]) | |
if err != nil { | |
log.Fatal(err) | |
} | |
defer xmlFile.Close() | |
outputFile, err := os.Create("/tmp/dat2") | |
if err != nil { | |
log.Fatal(err) | |
} | |
defer outputFile.Close() | |
d := xml.NewDecoder(xmlFile) | |
pagesMatched := 0 | |
pagesProcessed := 0 | |
fmt.Printf("%s - Start\n", time.Now().Format(time.UnixDate)) | |
for { | |
tok, err := d.Token() | |
if tok == nil || err == io.EOF { | |
// EOF means we're done. | |
break | |
} else if err != nil { | |
log.Fatalf("Error decoding token: %s", err) | |
} | |
switch ty := tok.(type) { | |
case xml.StartElement: | |
if ty.Name.Local == "page" { | |
pagesProcessed++ | |
// If this is a start element named "page", parse this element | |
// fully. | |
var p Page | |
if err = d.DecodeElement(&p, &ty); err != nil { | |
log.Fatalf("Error decoding item: %s", err) | |
} | |
if p.Redirect != nil { | |
continue | |
} | |
if strings.HasPrefix(p.Title, "User:") { | |
continue | |
} | |
if strings.HasPrefix(p.Title, "User talk:") { | |
continue | |
} | |
var earliestRevisionDate = today | |
var earliestRevisionIndex = 0 | |
for i := 0; i < len(p.Revision); i++ { | |
r := p.Revision[i] | |
if r.Timestamp.Time.Before(earliestRevisionDate) { | |
earliestRevisionDate = r.Timestamp.Time | |
earliestRevisionIndex = i | |
} | |
} | |
if earliestRevisionDate.Before(threeMonthsAgo) { | |
continue | |
} | |
r := p.Revision[earliestRevisionIndex] | |
if strings.HasPrefix(r.Text, "#REDIRECT") { | |
continue | |
} | |
if strings.HasPrefix(r.Text, "#redirect") { | |
continue | |
} | |
if strings.HasPrefix(r.Comment, "/* top */Adding redirect") { | |
continue | |
} | |
if strings.HasPrefix(r.Comment, "Redirecting to") { | |
continue | |
} | |
if strings.HasPrefix(r.Comment, "[[WP:AES|←]]Redirected page to ") { | |
continue | |
} | |
var textSnippet string | |
if len(r.Text) > 20 { | |
textSnippet = r.Text[0:20] | |
} else { | |
textSnippet = r.Text | |
} | |
bytesToWrite := []byte(p.Title + " - " + textSnippet + "\n") | |
_, err := outputFile.Write(bytesToWrite) | |
pagesMatched++ | |
check(err) | |
} | |
default: | |
} | |
} | |
fmt.Printf("%s - processed %v, found %v\n", time.Now().Format(time.UnixDate), pagesProcessed, pagesMatched) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment