phyous/build_corpus.rb

## build_corpus.rb
# Builds a sentence corpus by expanding outward from a list of seed search terms.
#
# Terms are handled first-in, first-out: each term taken from the queue
# contributes its sentences to the corpus, and its related terms are appended
# to the queue so the search keeps widening. Collection stops once at least
# `target_count` sentences have been gathered or no unsearched terms remain.
#
# Relies on getBingSentences and getRelatedTerms, which are defined elsewhere
# in this file, so call this only after the whole file has been loaded.
#
# @param search_seed [Array<String>] initial search terms (the defaults are placeholders)
# @param target_count [Integer] number of sentences to collect before stopping
# @param n [Integer] how many top search results each helper should look at
#   (placeholder default -- tune as needed)
# @return [Array<String>] the collected sentences
def build_corpus(search_seed = %w[term1 term2 termN], target_count = 150_000, n = 10)
  # FIFO queue of terms still to search; uniq also copies, so the caller's array is untouched
  pending = search_seed.uniq

  # Every term ever queued, so a term that keeps reappearing as "related" is searched only once
  seen = {}
  pending.each { |queued| seen[queued] = true }

  sentences = []

  while sentences.length < target_count && !pending.empty?
    # get search term to use for this iteration
    term = pending.shift

    # Given a search term, get related sentences and add them to the corpus.
    # Array() tolerates a helper that returns nil (e.g. an unimplemented stub).
    sentences.concat(Array(getBingSentences(term, n)))

    # Given a term, get related terms and queue the ones not seen before
    Array(getRelatedTerms(term, n)).each do |related|
      next if seen.key?(related)

      seen[related] = true
      pending << related
    end
  end

  # done
  sentences
end

#----- Helper Methods -----
# Given a search term, get related sentences by sampling a couple of phrases
# from each of the top Bing search results.
#
# NOTE: not implemented yet -- the body is only an outline of the intended
# steps, so this currently returns nil.
#
# @param term [String] the search term
# @param n [Integer] how many of the top results to sample. Optional so that
#   callers may pass only the term; the default of 10 is a placeholder.
# @return [nil] until implemented (intended: the sentences that were found)
def getBingSentences(term, n = 10)
  # 1- Get Bing search results for the term
  # 2- For each of the top n results, download the page (curl -O)
  # 3- Run the page HTML through an HTML text extractor (jsoup, tika, etc.)
  # 4- Using a regex for sentences (longer than, say, 80 characters), find sentences.
end

# Given a term, get the top n terms related to it.
#
# NOTE: not implemented yet -- the body is only an outline of the intended
# steps, so this currently returns nil.
#
# @param term [String] the search term
# @param n [Integer] how many top results / terms to consider. Optional so
#   that callers may pass only the term; the default of 10 is a placeholder.
# @return [nil] until implemented (intended: the related terms)
def getRelatedTerms(term, n = 10)
  # 1- Get Bing search results for the term
  # 2- For each of the top n results, download the page (curl -O)
  # 3- Using tf-idf, skipping stop words, get the top terms in each document
end
	# Builds a sentence corpus by expanding outward from a list of seed search terms.
	#
	# Terms are handled first-in, first-out: each term taken from the queue
	# contributes its sentences to the corpus, and its related terms are appended
	# to the queue so the search keeps widening. Collection stops once at least
	# `target_count` sentences have been gathered or no unsearched terms remain.
	#
	# Relies on getBingSentences and getRelatedTerms, which are defined elsewhere
	# in this file, so call this only after the whole file has been loaded.
	#
	# @param search_seed [Array<String>] initial search terms (the defaults are placeholders)
	# @param target_count [Integer] number of sentences to collect before stopping
	# @param n [Integer] how many top search results each helper should look at
	#   (placeholder default -- tune as needed)
	# @return [Array<String>] the collected sentences
	def build_corpus(search_seed = %w[term1 term2 termN], target_count = 150_000, n = 10)
		# FIFO queue of terms still to search; uniq also copies, so the caller's array is untouched
		pending = search_seed.uniq

		# Every term ever queued, so a term that keeps reappearing as "related" is searched only once
		seen = {}
		pending.each { |queued| seen[queued] = true }

		sentences = []

		while sentences.length < target_count && !pending.empty?
			# get search term to use for this iteration
			term = pending.shift

			# Given a search term, get related sentences and add them to the corpus.
			# Array() tolerates a helper that returns nil (e.g. an unimplemented stub).
			sentences.concat(Array(getBingSentences(term, n)))

			# Given a term, get related terms and queue the ones not seen before
			Array(getRelatedTerms(term, n)).each do |related|
				next if seen.key?(related)

				seen[related] = true
				pending << related
			end
		end

		# done
		sentences
	end

	#----- Helper Methods -----
	# Given a search term, get related sentences by sampling a couple of phrases
	# from each of the top Bing search results.
	#
	# NOTE: not implemented yet -- the body is only an outline of the intended
	# steps, so this currently returns nil.
	#
	# @param term [String] the search term
	# @param n [Integer] how many of the top results to sample. Optional so that
	#   callers may pass only the term; the default of 10 is a placeholder.
	# @return [nil] until implemented (intended: the sentences that were found)
	def getBingSentences(term, n = 10)
		# 1- Get Bing search results for the term
		# 2- For each of the top n results, download the page (curl -O)
		# 3- Run the page HTML through an HTML text extractor (jsoup, tika, etc.)
		# 4- Using a regex for sentences (longer than, say, 80 characters), find sentences.
	end

	# Given a term, get the top n terms related to it.
	#
	# NOTE: not implemented yet -- the body is only an outline of the intended
	# steps, so this currently returns nil.
	#
	# @param term [String] the search term
	# @param n [Integer] how many top results / terms to consider. Optional so
	#   that callers may pass only the term; the default of 10 is a placeholder.
	# @return [nil] until implemented (intended: the related terms)
	def getRelatedTerms(term, n = 10)
		# 1- Get Bing search results for the term
		# 2- For each of the top n results, download the page (curl -O)
		# 3- Using tf-idf, skipping stop words, get the top terms in each document
	end