Created
June 4, 2015 15:23
-
-
Save defect/d19c16bb5067e2bf37f1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import com.twitter.scalding._
/**
 * Scalding job that computes a TF-IDF score for every (artist, tag) pair.
 *
 * Input (`--input`): a text file with one record per line, fields separated by the
 * literal string `<sep>`. Fields 1, 2 and 3 (zero-based) are read as the artist name,
 * the tag name and the integer tag count; field 0 is ignored.
 *
 * Output (`--output`): a TSV file with a header row and the columns
 * `artist`, `tag`, `tfidf`.
 *
 * Definitions used here:
 *  - tf(artist, tag) = tag_count / sum of tag_count over all rows of that artist
 *  - idf(tag)        = log2(number of distinct artists / number of distinct artists using tag)
 *
 * NOTE: parsing is strict. A line with fewer than four fields, or with a count that is
 * not an integer, throws inside the parse step.
 *
 * @param args job arguments; must contain `input` and `output`
 */
class TagCountJob(args: Args) extends Job(args) {
  val input = TextLine(args("input"))
  val output = Tsv(args("output"), writeHeader = true)

  // Parse the input file and extract artist name, tag name and tag count.
  // Only 'line is consumed; the line offset emitted by TextLine is not needed.
  val pairs = input.read.mapTo('line -> ('artist, 'tag, 'tag_count)) { line: String =>
    val fields = line.split("<sep>")
    (fields(1), fields(2), fields(3).toInt)
  }

  /* TF */

  // Total tag count per artist (the denominator of tf).
  val artistNumTags = pairs.groupBy('artist) { _.sum[Int]('tag_count -> 'doc_tagcount) }

  // tf value for each artist/tag row.
  val tf = pairs.joinWithSmaller('artist -> 'artist, artistNumTags)
    .map(('tag_count, 'doc_tagcount) -> 'tf_value) { item: (Int, Int) =>
      val (tagCount, docTagCount) = item
      tagCount.toDouble / docTagCount
    }
    .project('artist, 'tag, 'tf_value)

  /* IDF */

  // Total number of distinct artists.
  val n = pairs
    .unique('artist)
    .groupAll { _.size }
    .rename('size -> 'n_artists)
    .project('n_artists)

  // Document frequency for each tag: the number of distinct artists carrying the tag.
  // Rows are de-duplicated on (artist, tag) first so that repeated input rows cannot
  // push df_count above n_artists, which would make the idf negative.
  val df = pairs
    .unique(('artist, 'tag))
    .groupBy('tag) { _.size }
    .rename('size -> 'df_count)

  // Use n to calculate the inverse document frequency for each tag.
  // Both counts come from `size` and are therefore read as Long.
  val idf = df.crossWithTiny(n)
    .mapTo(('tag, 'df_count, 'n_artists) -> ('tag, 'idf_value)) {
      item: (String, Long, Long) =>
        val (tag, dfCount, nArtists) = item
        (tag, log2(nArtists.toDouble / dfCount))
    }

  /* TF/IDF */

  // tf * idf for each artist and tag.
  val tfidf = tf.joinWithSmaller('tag -> 'tag, idf)
    .map(('tf_value, 'idf_value) -> 'tfidf) { item: (Double, Double) =>
      val (tfValue, idfValue) = item
      tfValue * idfValue
    }
    .project('artist, 'tag, 'tfidf)

  /** Base-2 logarithm of `x`. */
  def log2(x: Double): Double = math.log10(x) / math.log10(2)

  tfidf.write(output)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment