Skip to content

Instantly share code, notes, and snippets.

@yyano
Last active December 18, 2017 12:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yyano/07c1b6c9b971c03ffbd6a0c4b2d30f74 to your computer and use it in GitHub Desktop.
Save yyano/07c1b6c9b971c03ffbd6a0c4b2d30f74 to your computer and use it in GitHub Desktop.
WikipediaのダンプXML（pages-articles.xml）をpageごとにJSONファイルとして出力する
package main
import (
"bufio"
"encoding/json"
"encoding/xml"
"fmt"
"io/ioutil"
"log"
"os"
"path/filepath"
)
// Text holds one XML element decoded as an attribute plus its character data.
// It is used as the type of Revision.Text, which is mapped to the <text>
// child element of a revision.
type Text struct {
// Space is filled from the element's "space" attribute.
Space string `xml:"space,attr"`
// Text is filled from the character data inside the element.
Text string `xml:",chardata"`
}
// Redirect holds the attributes decoded from a page's <redirect> child
// element (see XmlPage.Redirect).
type Redirect struct {
// Title is filled from the element's "title" attribute.
Title string `xml:"title,attr"`
}
// Revision holds the fields decoded from a page's <revision> child element
// (see XmlPage.Revision). Each field is filled from the child element named in
// its xml tag. No json tags are declared, so encoding/json uses the Go field
// names as object keys when a Revision is marshalled.
type Revision struct {
// Id is decoded from <id> as an integer.
Id int `xml:"id"`
// Parentid is decoded from <parentid> as an integer; it stays 0 when the
// element is absent.
Parentid int `xml:"parentid"`
// Timestamp is kept as the raw string found in <timestamp>; it is not parsed.
Timestamp string `xml:"timestamp"`
// Minor receives the character data of <minor>.
Minor string `xml:"minor"`
// Comment receives the character data of <comment>.
Comment string `xml:"comment"`
// Model receives the character data of <model>.
Model string `xml:"model"`
// Format receives the character data of <format>.
Format string `xml:"format"`
// Text is decoded from the <text> child element (attribute and body).
Text Text `xml:"text"`
// Sha1 receives the character data of <sha1>.
Sha1 string `xml:"sha1"`
}
// XmlPage is the decode target for one <page> element and the value that
// convertJson marshals to JSON. Each field is filled from the child element
// named in its xml tag. No json tags are declared, so the JSON object keys are
// the Go field names.
type XmlPage struct {
// Title receives the character data of <title>.
Title string `xml:"title"`
// Ns is decoded from <ns> as an integer.
Ns int `xml:"ns"`
// Id is decoded from <id> as an integer; convertJson uses it to build the
// output file name.
Id int `xml:"id"`
// Redirect is decoded from <redirect>. It is a value, not a pointer, so it
// is left zero-valued when the element is absent.
Redirect Redirect `xml:"redirect"`
// Restrictions receives the character data of <restrictions>.
Restrictions string `xml:"restrictions"`
// Revision is decoded from the <revision> child element.
Revision Revision `xml:"revision"`
}
// main reads ./test.xml line by line, collects the lines that make up each
// <page> element, and passes every completed element to convertJson.
//
// Page boundaries are recognised by exact line comparison, so the opening and
// closing tags must each sit alone on a line with exactly the leading
// whitespace used in the literals below. The process exits with status 1 when
// the input cannot be opened or read.
func main() {
	xmlfile, err := os.Open("./test.xml")
	if err != nil {
		log.Println("Error opening file:", err)
		os.Exit(1)
	}
	defer xmlfile.Close()

	// bufio.Scanner refuses lines longer than 64 KiB unless given a larger
	// limit; a single long line would otherwise end the scan early.
	const (
		initialLineBytes = 64 * 1024
		maxLineBytes     = 64 * 1024 * 1024
	)
	scanner := bufio.NewScanner(xmlfile)
	scanner.Buffer(make([]byte, 0, initialLineBytes), maxLineBytes)

	// section accumulates the current page. It is a byte slice that is reset
	// with [:0] so its storage is reused, instead of a string grown with "+"
	// (which copies the whole page again for every line).
	var section []byte
	for scanner.Scan() {
		line := scanner.Text()
		if " <page>" == line {
			// A new page starts: drop whatever was collected before it.
			section = section[:0]
		}
		section = append(section, line...)
		section = append(section, '\n')
		if " </page>" == line {
			convertJson(string(section))
		}
	}

	// Scan returning false can mean end of input or a failure (for example a
	// line above maxLineBytes); only Err distinguishes the two.
	if err := scanner.Err(); err != nil {
		log.Println("Error reading file:", err)
		xmlfile.Close() // os.Exit does not run deferred calls.
		os.Exit(1)
	}
}
// convertJson decodes one <page> element from xmlData into an XmlPage and
// writes it as JSON to ./pages/page_<id>.json, where <id> is the page id
// zero-padded to eight digits. The ./pages directory is created if missing.
//
// Nothing is returned: every failure is logged and the page is skipped. On
// success the output path and the page title are logged.
func convertJson(xmlData string) {
	var page XmlPage
	if err := xml.Unmarshal([]byte(xmlData), &page); err != nil {
		// Skip the page: after a decode error the struct cannot be trusted,
		// and writing it could produce a misleading file such as
		// page_00000000.json when the id was never read.
		log.Println("XML Unmarshal error:", err)
		return
	}

	jsonBytes, err := json.Marshal(page)
	if err != nil {
		log.Println("JSON Marshal error:", err)
		return
	}

	const outDir = "./pages"
	// Without the directory every WriteFile call below would fail.
	if err := os.MkdirAll(outDir, 0755); err != nil {
		log.Println("Error creating directory:", err)
		return
	}

	jsonfilename := filepath.Join(outDir, fmt.Sprintf("page_%08d.json", page.Id))
	// 0644: the output is plain data, so it needs neither the execute bit nor
	// world write access.
	if err := ioutil.WriteFile(jsonfilename, jsonBytes, 0644); err != nil {
		log.Println("Error writing file:", err)
		return
	}
	log.Println(jsonfilename, page.Title)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment