class VectorSimilarities(MRJob):
    '''
    Multi-step MRJob whose pipeline is wired together in steps().

    The first step reads records through input(), which concrete
    subclasses are required to override. Integer tuning parameters are
    registered on the command line by configure_options().
    '''

    def steps(self):
        '''
        Build the job pipeline.

        Returns a list of four steps, each created with self.mr(...), in
        the order they are listed below.
        '''
        # Each step is created in the same order it appears in the result.
        group_step = self.mr(self.input, self.group_by_user_rating)
        # The second step is built with None as its first argument.
        count_step = self.mr(None, self.count_ratings_users_freq)
        similarity_step = self.mr(self.pairwise_items,
                                  self.calculate_similarity)
        ranking_step = self.mr(self.calculate_ranking,
                               self.top_similar_items)
        return [group_step, count_step, similarity_step, ranking_step]

    def configure_options(self):
        '''
        Register this job's command-line options.

        Calls the parent configure_options() first, then adds five
        integer options through add_passthrough_option():

          --priorcount        -> prior_count        (default 10)
          --priorcorrelation  -> prior_correlation  (default 0)
          --minraters         -> min_num_raters     (default 3)
          --maxraters         -> max_num_raters     (default 10000)
          --minintersec       -> min_intersection   (default 0)
        '''
        super(VectorSimilarities, self).configure_options()

        # (flag, dest, default, help) -- every option is declared as an int.
        option_table = (
            ('--priorcount', 'prior_count', 10,
             'PRIOR_COUNT: Parameter to regularize correlation'),
            ('--priorcorrelation', 'prior_correlation', 0,
             'PRIOR_CORRELATION: Parameter to regularize correlation'),
            ('--minraters', 'min_num_raters', 3,
             'the minimum number of raters'),
            ('--maxraters', 'max_num_raters', 10000,
             'the maximum number of raters'),
            ('--minintersec', 'min_intersection', 0,
             'the minimum intersection'),
        )
        for flag, dest_name, default_value, help_text in option_table:
            self.add_passthrough_option(
                flag,
                dest=dest_name,
                default=default_value,
                type='int',
                help=help_text)

    def input(self, key, line):
        '''
        Hook for the first pipeline step; must be overridden.

        steps() passes this method as the first argument of the first
        self.mr(...) call (presumably the mapper -- confirm against the
        mrjob version in use). The base implementation does no work.

        Raises:
            NotImplementedError: always, until a subclass overrides it.
        '''
        raise NotImplementedError('Implement this in the subclass')

    # NOTE(review): the source shows an ellipsis at this point; the methods
    # referenced by steps() (group_by_user_rating, pairwise_items, ...) are
    # not visible in this excerpt.
    ...