hardik-vala/sol_to_davids_cl_puzzler.sc

## sol_to_davids_cl_puzzler.sc
import java.io.File

/*
 * Problem description: Given a .tsv file where the first column is a tweet’s text content, the second
 * column is the id of the user who made the tweet, and the third column is the lat/lon of the tweet
 * (if any) with a space between the two values, how would you find the most frequently used hashtag
 * for each user who has tweeted more than five times and has at least one tweet from within the
 * continental US?
 */

val inputPath: String = "/home/ndg/project/jurgens/command-line-demo/command-line-challenge-input.BIG.tsv"

/**
 * Reads the .tsv at `path` and returns its rows split on tabs, keeping only rows that
 * have a second column (the user id) that is not blank.
 *
 * All rows are materialised into a List before the file handle is closed.
 */
def readRows(path: String): List[Array[String]] = {
  val source = scala.io.Source.fromFile(new File(path))
  try {
    source
      // Get an iterator over the rows in the .tsv.
      .getLines()
      // Map each row into an Array with the .tsv entries.
      .map(_.split("\t"))
      // Drop rows without a user id column or with a blank user id.
      .filter(cols => cols.length >= 2 && cols(1).trim.nonEmpty)
      // Force the iterator while the file is still open.
      .toList
  } finally {
    source.close()
  }
}

/**
 * Tells whether a row carries a "lat lon" third column that falls strictly inside the
 * bounding box used here for the continental U.S.
 *
 * Rows with no third column, with fewer than two space-separated values, or with values
 * that do not parse as doubles are treated as "not in the box" instead of throwing.
 */
def inContinentalUs(row: Array[String]): Boolean =
  row.length > 2 && (row(2).split(" ") match {
    case Array(latText, lonText, _*) =>
      scala.util.Try((latText.toDouble, lonText.toDouble)).toOption.exists {
        case (lat, lon) =>
          24.3115 < lat && lat < 49.2341 &&
            -124.626080 < lon && lon < -62.361014
      }
    case _ => false
  })

/**
 * Returns the hashtag used most often across the given rows, or None when the rows
 * contain no hashtag at all.
 *
 * Tokenization is performed using single space separation; every token starting with
 * '#' counts as a hashtag. If there's a tie, the hashtag occurring last,
 * lexicographically, is returned.
 */
def mostUsedHashtag(rows: List[Array[String]]): Option[String] = {
  val counts = rows
    // Collect the hashtags of every tweet text into one list.
    .flatMap(_(0).split(" ").filter(_.startsWith("#")))
    // Group the hashtags by unique hashtag.
    .groupBy(identity)
    // Map each hashtag to its number of occurrences.
    .map { case (hashtag, uses) => hashtag -> uses.size }
  // maxBy throws on an empty collection, so guard users without hashtags.
  if (counts.isEmpty) None else Some(counts.maxBy(_.swap)._1)
}

readRows(inputPath)
  // Group the rows according to user id.
  .groupBy(_(1))
  // Keep users with more than 5 tweets.
  .filter(_._2.size > 5)
  // Keep users with at least one tweet in the continental U.S.
  .filter(_._2.exists(inContinentalUs))
  // Map the rows for each user to the user's most used hashtag (if any).
  .map { case (user, rows) => user -> mostUsedHashtag(rows) }
  // Users who never used a hashtag have no answer and are left out.
  .collect { case (user, Some(hashtag)) => user -> hashtag }
	import java.io.File

	/*
	* Problem description: Given a .tsv file where the first column is a tweet’s text content, the second
	* column is the id of the user who made the tweet, and the third column is the lat/lon of the tweet
	* (if any) with a space between the two values, how would you find the most frequently used hashtag
	* for each user who has tweeted more than five times and has at least one tweet from within the
	* continental US?
	*/

	// NOTE(review): this section repeats the script earlier in the file and defines
	// `inputPath` a second time; that is likely a duplicate-definition error when the
	// file is compiled as a single script -- confirm whether this copy is needed.
	val inputPath: String = "/home/ndg/project/jurgens/command-line-demo/command-line-challenge-input.BIG.tsv"

	/**
	 * Reads the .tsv at `path` and returns its rows split on tabs, keeping only rows that
	 * have a second column (the user id) that is not blank.
	 *
	 * All rows are materialised into a List before the file handle is closed.
	 */
	def readRows(path: String): List[Array[String]] = {
	  val source = scala.io.Source.fromFile(new File(path))
	  try {
	    source
	      // Get an iterator over the rows in the .tsv.
	      .getLines()
	      // Map each row into an Array with the .tsv entries.
	      .map(_.split("\t"))
	      // Drop rows without a user id column or with a blank user id.
	      .filter(cols => cols.length >= 2 && cols(1).trim.nonEmpty)
	      // Force the iterator while the file is still open.
	      .toList
	  } finally {
	    source.close()
	  }
	}

	/**
	 * Tells whether a row carries a "lat lon" third column that falls strictly inside the
	 * bounding box used here for the continental U.S.
	 *
	 * Rows with no third column, with fewer than two space-separated values, or with values
	 * that do not parse as doubles are treated as "not in the box" instead of throwing.
	 */
	def inContinentalUs(row: Array[String]): Boolean =
	  row.length > 2 && (row(2).split(" ") match {
	    case Array(latText, lonText, _*) =>
	      scala.util.Try((latText.toDouble, lonText.toDouble)).toOption.exists {
	        case (lat, lon) =>
	          24.3115 < lat && lat < 49.2341 &&
	            -124.626080 < lon && lon < -62.361014
	      }
	    case _ => false
	  })

	/**
	 * Returns the hashtag used most often across the given rows, or None when the rows
	 * contain no hashtag at all.
	 *
	 * Tokenization is performed using single space separation; every token starting with
	 * '#' counts as a hashtag. If there's a tie, the hashtag occurring last,
	 * lexicographically, is returned.
	 */
	def mostUsedHashtag(rows: List[Array[String]]): Option[String] = {
	  val counts = rows
	    // Collect the hashtags of every tweet text into one list.
	    .flatMap(_(0).split(" ").filter(_.startsWith("#")))
	    // Group the hashtags by unique hashtag.
	    .groupBy(identity)
	    // Map each hashtag to its number of occurrences.
	    .map { case (hashtag, uses) => hashtag -> uses.size }
	  // maxBy throws on an empty collection, so guard users without hashtags.
	  if (counts.isEmpty) None else Some(counts.maxBy(_.swap)._1)
	}

	readRows(inputPath)
	  // Group the rows according to user id.
	  .groupBy(_(1))
	  // Keep users with more than 5 tweets.
	  .filter(_._2.size > 5)
	  // Keep users with at least one tweet in the continental U.S.
	  .filter(_._2.exists(inContinentalUs))
	  // Map the rows for each user to the user's most used hashtag (if any).
	  .map { case (user, rows) => user -> mostUsedHashtag(rows) }
	  // Users who never used a hashtag have no answer and are left out.
	  .collect { case (user, Some(hashtag)) => user -> hashtag }