// Last active October 5, 2019 20:05
// Parsing big XML files using a stream of tokens
// Parser is a command-line tool for parsing big XML files.
// Author: Ali Shanaakh <>
// Usage: go run parse.go -path=./15-ufop/15.1-EX_XML_EDR_UO_03.10.2019.xml
package main
import (
	"encoding/xml"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"strings"
	"time"

	"golang.org/x/text/encoding/charmap"
)
// Founder is one founder entry of a Record. Its value is the character data
// of a single FOUNDER element.
type Founder struct {
	Founder string `xml:",chardata"`
}

// Record holds the fields decoded from one RECORD element. Each string field
// is filled from the child element named in its xml tag.
type Record struct {
	EDRPOU              string `xml:"EDRPOU"`
	KVED                string `xml:"KVED"`
	Boss                string `xml:"BOSS"`
	Stan                string `xml:"STAN"`
	ShortName           string `xml:"SHORT_NAME"`
	Name                string `xml:"NAME"`
	Address             string `xml:"ADDRESS"`
	FoundingDocumentNum string `xml:"FOUNDING_DOCUMENT_NUM"`

	// Founders gets one entry per FOUNDER element nested in FOUNDERS. The
	// "FOUNDERS>FOUNDER" path is required for that: tagging the slice with
	// just "FOUNDERS" yields one entry per wrapper element, and repeated
	// FOUNDER children overwrite each other so only the last one survives.
	Founders []Founder `xml:"FOUNDERS>FOUNDER"`
}
// Command-line flags.
var (
	// path is the value of the -path flag: the XML file to parse. It
	// defaults to the empty string.
	path = flag.String("path", "", "Path to XML file")
)
func windows1251(charset string, input io.Reader) (io.Reader, error) {
switch charset {
case "windows-1251":
return charmap.Windows1251.NewDecoder().Reader(input), nil
return nil, fmt.Errorf("unknown charset: %s", charset)
func main() {
start := time.Now()
f, err := os.Open(*path)
if err != nil {
stats := make(map[string]int)
decoder := xml.NewDecoder(f)
decoder.CharsetReader = windows1251
for {
// Read tokens from the XML document in a stream.
t, _ := decoder.Token()
if t == nil {
// Inspect the type of the token just read.
switch token := t.(type) {
case xml.StartElement:
if token.Name.Local == "RECORD" {
var record Record
decoder.DecodeElement(&record, &token)
for k, v := range stats {
log.Println(k, v)
log.Println("Time of execution", time.Since(start))
