Skip to content

Instantly share code, notes, and snippets.

@kampersanda
Last active February 9, 2020 10:31
Show Gist options
  • Save kampersanda/42152547908b97585e4b2fc24944f391 to your computer and use it in GitHub Desktop.
Save kampersanda/42152547908b97585e4b2fc24944f391 to your computer and use it in GitHub Desktop.
Extract keywords from the AOL query dataset
package main
import (
"bufio"
"fmt"
"os"
"path/filepath"
"sort"
"strings"
)
// main gathers the query column from every file matching
// "AOL-user-ct-collection/*.txt" (via parse), sorts the collected queries, and
// writes each distinct query on its own line to "AOL-queries.txt".
//
// If no files match, or none of them yields a query, an empty output file is
// produced. Any glob, create, write, or flush failure panics.
func main() {
	fns, err := filepath.Glob("AOL-user-ct-collection/*.txt")
	if err != nil {
		panic(err)
	}

	var queries []string
	for _, fn := range fns {
		fmt.Println("parsing... " + fn)
		queries = parse(fn, queries)
	}
	// Sorting puts equal queries next to each other, so duplicates can be
	// dropped below by comparing each entry with its predecessor.
	sort.Strings(queries)

	out, err := os.Create("AOL-queries.txt")
	if err != nil {
		panic(err)
	}
	defer out.Close()

	// Buffer the output: writing each query straight to the file would cost
	// one write call per line.
	w := bufio.NewWriter(out)
	for i, query := range queries {
		if i > 0 && queries[i-1] == query {
			continue // duplicate of the previous (already written) query
		}
		if _, err := w.WriteString(query + "\n"); err != nil {
			panic(err)
		}
	}
	// Flush must succeed before the deferred Close, otherwise buffered lines
	// would be lost.
	if err := w.Flush(); err != nil {
		panic(err)
	}
}
// parse reads the tab-separated file fn and appends the second column
// (index 1) of every data line to queries, returning the extended slice.
//
// The first line is treated as a header and discarded. Lines with fewer than
// two tab-separated fields (for example a blank line) are skipped rather than
// causing an index-out-of-range panic. parse panics if the file cannot be
// opened or if scanning stops because of an error instead of end of file.
func parse(fn string, queries []string) []string {
	file, err := os.Open(fn)
	if err != nil {
		panic(err)
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	scanner.Scan() // skip header
	for scanner.Scan() {
		// Only the second field is used, so cap the split at three pieces
		// instead of splitting every column.
		fields := strings.SplitN(scanner.Text(), "\t", 3)
		if len(fields) < 2 {
			continue
		}
		queries = append(queries, fields[1])
	}
	// Scan returns false both at end of file and on failure (e.g. a line
	// longer than the scanner's buffer); Err distinguishes the two so a
	// partially read file is not mistaken for a complete one.
	if err := scanner.Err(); err != nil {
		panic(err)
	}
	return queries
}
# Download the bzip2-compressed tar archive of the AOL query logs.
wget http://www.cim.mcgill.ca/~dudek/206/Logs/AOL-user-ct-collection/aol-data.tar.bz2
# Extract the archive (-j bzip2, -x extract, -v verbose, -f archive file).
# NOTE(review): main.go globs AOL-user-ct-collection/*.txt, so the archive is
# presumably expected to unpack plain .txt files into that directory — confirm.
tar -jxvf aol-data.tar.bz2
# Run the extractor; it writes the sorted, de-duplicated queries to AOL-queries.txt.
go run main.go
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment