Skip to content

Instantly share code, notes, and snippets.

@defect
Created June 4, 2015 15:23
Show Gist options
  • Save defect/d19c16bb5067e2bf37f1 to your computer and use it in GitHub Desktop.
Save defect/d19c16bb5067e2bf37f1 to your computer and use it in GitHub Desktop.
import com.twitter.scalding._
/**
 * Scalding job that computes a TF-IDF weight for every (artist, tag) pair.
 *
 * Input (`--input`): text lines whose columns are separated by the literal
 * string `<sep>`. Columns 1, 2 and 3 are read as artist name, tag name and
 * tag count; column 0 is ignored. A line with fewer than four columns, or a
 * non-numeric count, makes the job fail.
 *
 * Output (`--output`): TSV with a header row and the columns
 * `artist`, `tag`, `tfidf`.
 */
class TagCountJob(args : Args) extends Job(args) {
  val input = TextLine(args("input"))
  val output = Tsv(args("output"), writeHeader = true)

  /** Base-2 logarithm of `x`. */
  def log2(x: Double): Double = math.log10(x) / math.log10(2)

  // Parse each input line into (artist, tag, tag_count).
  // Only 'line is consumed: the 'offset field is never used, so it is not
  // pulled into the function (and not narrowed from Long to Int).
  val pairs = input.read.mapTo('line -> ('artist, 'tag, 'tag_count)) { line: String =>
    val fields = line.split("<sep>")
    (fields(1), fields(2), fields(3).toInt)
  }

  /* TF */

  // Total tag count per artist (the "document length" of an artist).
  val artistNumTags = pairs.groupBy('artist) { _.sum[Int]('tag_count -> 'doc_tagcount) }

  // Term frequency: share of an artist's total tag count held by one tag.
  val tf = pairs
    .joinWithSmaller('artist -> 'artist, artistNumTags)
    .map(('tag_count, 'doc_tagcount) -> 'tf_value) { counts: (Int, Int) =>
      val (tagCount, docTagCount) = counts
      tagCount.toDouble / docTagCount
    }
    .project('artist, 'tag, 'tf_value)

  /* IDF */

  // Total number of distinct artists, as a single-row pipe.
  val n = pairs
    .unique('artist)
    .groupAll { _.size }
    .rename('size -> 'n_artists)
    .project('n_artists)

  // Document frequency per tag: number of input rows carrying the tag.
  // NOTE(review): this equals the number of artists using the tag only if
  // each (artist, tag) pair occurs at most once in the input -- confirm.
  val df = pairs
    .groupBy('tag) { _.size }
    .rename('size -> 'df_count)

  // Inverse document frequency: log2(n_artists / df_count).
  // `size` emits Long counts, so they are read as Long rather than Int.
  val idf = df
    .crossWithTiny(n)
    .mapTo(('tag, 'df_count, 'n_artists) -> ('tag, 'idf_value)) { row: (String, Long, Long) =>
      val (tag, dfCount, nArtists) = row
      (tag, log2(nArtists.toDouble / dfCount))
    }

  /* TF-IDF */

  // tf * idf for each (artist, tag).
  val tfidf = tf
    .joinWithSmaller('tag -> 'tag, idf)
    .map(('tf_value, 'idf_value) -> 'tfidf) { weights: (Double, Double) =>
      val (tfValue, idfValue) = weights
      tfValue * idfValue
    }
    .project('artist, 'tag, 'tfidf)

  tfidf.write(output)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment