Skip to content

Instantly share code, notes, and snippets.

@pjox
Created June 7, 2022 13:38
Show Gist options
  • Save pjox/54e6a176679a1fc8cb653048bb4dd737 to your computer and use it in GitHub Desktop.
Save pjox/54e6a176679a1fc8cb653048bb4dd737 to your computer and use it in GitHub Desktop.
Count the Number of Documents in OSCAR 21.09 for a given language
package main
import (
	"bufio"
	"compress/gzip"
	"errors"
	"fmt"
	"io"
	"log"
	"os"
	"path/filepath"
	"sync"
)
// numDocs accumulates a document total that several goroutines can add to.
type numDocs struct {
	mux  sync.Mutex // held while docs is being updated by sumDocs
	docs int        // running total of documents counted so far
}

// sumDocs adds fileDocs to the running total under the mutex.
func (nd *numDocs) sumDocs(fileDocs int) {
	nd.mux.Lock()
	nd.docs += fileDocs
	nd.mux.Unlock()
}
// extractNCount counts the documents in the gzip-compressed text file at path,
// prints the path and its count to standard output, and adds the count to
// numdocs.
//
// A document boundary is a line consisting solely of "\n". The file is
// expected to end with one extra blank line, so the document count is the
// number of blank lines minus one.
//
// It returns an error if the file cannot be opened, is not a valid gzip
// stream, or fails to decompress before end of file. In those cases nothing is
// printed and numdocs is left untouched.
func extractNCount(path string, numdocs *numDocs) error {
	fi, err := os.Open(path)
	if err != nil {
		return err
	}
	defer fi.Close()

	fz, err := gzip.NewReader(fi)
	if err != nil {
		return fmt.Errorf("opening gzip stream %q: %w", path, err)
	}
	defer fz.Close()

	count := 0
	bufin := bufio.NewReader(fz)
	for {
		line, err := bufin.ReadString('\n')
		if err != nil {
			if errors.Is(err, io.EOF) {
				// A trailing fragment without '\n' can never be a blank line.
				break
			}
			// Corrupt or truncated data: report it instead of returning a
			// silently truncated count.
			return fmt.Errorf("reading %q: %w", path, err)
		}
		if line == "\n" {
			count++
		}
	}

	// There is a double \n at the end of the file.
	count--

	// Emit both lines in one call so output from concurrent workers does not
	// get interleaved between a path and its count.
	fmt.Printf("%s\n%d\n", path, count)
	numdocs.sumDocs(count)
	return nil
}
// walkFiles walks the file tree rooted at root in a background goroutine and
// sends the path of every regular file matching "*/*.txt.gz" on the first
// returned channel. That channel is closed once the walk has finished; the
// walk's result is then available on the second (buffered) channel.
//
// Closing done aborts the walk with a "walk canceled" error.
func walkFiles(done <-chan struct{}, root string) (<-chan string, <-chan error) {
	out := make(chan string)
	result := make(chan error, 1)

	visit := func(p string, fi os.FileInfo, walkErr error) error {
		switch {
		case walkErr != nil:
			return walkErr
		case !fi.Mode().IsRegular():
			return nil
		}
		// The pattern is applied to the whole walked path, not just the file
		// name, so whether a file matches depends on how root was spelled.
		// Pay attention to where you put the binary.
		matched, _ := filepath.Match("*/*.txt.gz", p)
		if !matched {
			return nil
		}
		select {
		case <-done:
			return errors.New("walk canceled")
		case out <- p:
			return nil
		}
	}

	go func() {
		// Close the output channel after Walk returns.
		defer close(out)
		// result is buffered, so this send never blocks.
		result <- filepath.Walk(root, visit)
	}()

	return out, result
}
// main counts the documents in every matching file under the directory given
// as the first command-line argument and prints the grand total.
//
// Files are processed concurrently, with at most maxGoroutines workers running
// at a time. Any per-file error or walk error terminates the program.
func main() {
	if len(os.Args) < 2 {
		log.Fatal("usage: pass the root directory to scan as the first argument")
	}

	done := make(chan struct{})
	defer close(done)

	paths, errc := walkFiles(done, os.Args[1])

	const maxGoroutines = 60
	guard := make(chan struct{}, maxGoroutines)

	var wg sync.WaitGroup
	numdocs := numDocs{}
	for path := range paths {
		// Take a slot before spawning so that no more than maxGoroutines
		// goroutines exist at once; the walker simply blocks until one frees.
		guard <- struct{}{}
		wg.Add(1)
		go func(path string) {
			defer wg.Done()
			defer func() { <-guard }()
			if err := extractNCount(path, &numdocs); err != nil {
				log.Fatalln(err)
			}
		}(path)
	}

	// Check whether the Walk failed.
	if err := <-errc; err != nil {
		log.Fatal(err)
	}

	// All workers are done after Wait, so docs can be read without the lock.
	wg.Wait()
	fmt.Printf("\n%d\n", numdocs.docs)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment