Skip to content

Instantly share code, notes, and snippets.

@rvprasad
Last active April 21, 2017 05:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rvprasad/4d300fc44a7e13c28924c6682d2142f9 to your computer and use it in GitHub Desktop.
Save rvprasad/4d300fc44a7e13c28924c6682d2142f9 to your computer and use it in GitHub Desktop.
Groovy script to extract the tags that co-occur with a given tag on question-type posts in the Posts.xml file from the Stack Overflow data dump.
/*
* Copyright (c) 2017, Venkatesh-Prasad Ranganath
*
* BSD 3-clause License
*
* Author: Venkatesh-Prasad Ranganath
*/
import groovy.util.CliBuilder
import groovyx.gpars.actor.DynamicDispatchActor
// Tuning knobs: MEMORY_LIMIT bounds how many characters are handed to the processors
// before the reader blocks on a reply; NUM_PROCS is the number of processor actors.
final MEMORY_LIMIT = 1.5*1024**3
final NUM_PROCS = 7

// Command-line interface: both options take exactly one argument and are mandatory.
def cli = new CliBuilder(usage: "groovy collectTagsOccurringWithGivenTag.groovy")
cli.with {
    p(longOpt: 'postsFile', args: 1, argName: 'postsFile', required: true,
        'Posts file')
    a(longOpt: 'anchorTag', args: 1, argName: 'anchorTag', required: true,
        'Tag with which other tags should co-occur')
}

def options = cli.parse(args)
// Nothing to do when the arguments could not be parsed.
if (!options) return

// The tag of interest and the set accumulating every tag seen alongside it.
final anchorTag = options.anchorTag
final coOccurringTags = new HashSet()
// Single actor that owns all additions to coOccurringTags: it merges every batch of tags
// it receives and terminates when it is sent a boolean.
final collector = new DynamicDispatchActor().become {
    when { String[] tagBatch ->
        Collections.addAll(coOccurringTags, tagBatch)
    }
    when { boolean stopSignal ->
        terminate()
    }
}.start()
// Matches a question row (PostTypeId="1") whose Tags attribute contains the anchor tag and
// captures the escaped tag text before (group 1) and after (group 2) it.
// The anchor tag is quoted so that tags containing regex metacharacters (e.g. "c++" or
// ".net") are matched literally instead of being interpreted as regex syntax.
// The pattern is compiled once and shared by all processors.
final quotedAnchorTag = java.util.regex.Pattern.quote(anchorTag as String)
final tagExtractionPattern = ~/.* PostTypeId="1" .* Tags="([^"]*)&lt;${quotedAnchorTag}&gt;([^"]*)" .*/

// Pool of NUM_PROCS actors. Each one receives raw lines of the posts file, extracts the
// tags that accompany the anchor tag, and forwards them to the collector as a String[].
// Sending a boolean terminates the actor.
final processors = (0..<NUM_PROCS).collect {
    new DynamicDispatchActor().become {
        when { String line ->
            // FIX: handle non-printable Unicode characters
            // (every character not matched by \p{Print} is replaced with '?').
            def sanitized = line.replaceAll("[^\\p{Print}]", "?")
            def matcher = tagExtractionPattern.matcher(sanitized)
            if (matcher.matches()) {
                // Remaining tags still in their escaped form, e.g. "&lt;a&gt;&lt;b&gt;".
                def tags = matcher.group(1) + matcher.group(2)
                // Empty when the anchor tag is the only tag on the post.
                if (tags) {
                    // "&lt;a&gt;&lt;b&gt;" -> ",a,b"
                    tags = tags.replaceAll("&gt;", "")
                    tags = tags.replaceAll("&lt;", ",")
                    // Drop the leading comma before splitting into individual tags.
                    collector.send tags[1..-1].split(',')
                }
            }
            // Answer the sender (if there is one) so that sendAndWait callers can resume.
            replyIfExists 0
        }
        when { boolean i -> terminate() }
    }.start()
}
// Prints a progress line: lines read so far, number of tags collected so far, timestamp.
final reportProgress = { count ->
    println("$count lines processed ${coOccurringTags.size()} ${new Date()}")
}
reportProgress(0)

// Characters handed to the processors since the last blocking send.
def pendingChars = 0L
new File(options.postsFile).eachLine { text, lineNo ->
    pendingChars += text.size()
    // Lines are spread over the processors round-robin by line number.
    final worker = processors[lineNo % NUM_PROCS]
    if (pendingChars >= MEMORY_LIMIT) {
        // Too much queued up: wait for this processor's reply before reading further.
        worker.sendAndWait text
        pendingChars = 0L
    } else {
        worker.send text
    }
    if (lineNo % 500000 == 0) {
        reportProgress(lineNo)
    }
}
// Tell every processor to stop, then wait for all of them to finish.
for (processor in processors) {
    processor.send(false)
}
processors.each { it.join() }

// Only once the processors are done is the collector stopped and awaited.
collector.send(false)
collector.join()

// Write the anchor tag together with all co-occurring tags, sorted, one per line.
def outputName = "tags-occurring-with-${anchorTag}-tag.txt"
new File(outputName).withPrintWriter { out ->
    coOccurringTags.add(anchorTag)
    for (tag in coOccurringTags.sort()) {
        out.println(tag)
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment