Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jeffThompson/6718129 to your computer and use it in GitHub Desktop.
Save jeffThompson/6718129 to your computer and use it in GitHub Desktop.
A curatorial experiment through a residency with the Internet Archive: searches the Internet Archive for a given term, downloads the first result, and parses it for its most frequent word.
#!/usr/bin/python
'''
MOST FREQUENT WORD SEARCH
Jeff Thompson | 2013 | www.jeffreythompson.org
A curatorial experiment through a residency with the Internet Archive.
REQUIRES:
+ Natural Language Toolkit (NLTK)
+ Internet Archive search module
'''
import internetarchive as ia
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer
import os
search_term = 'test'
collection = 'gutenberg'
download_folder = 'DownloadedFiles'
file_format = '.txt'
min_word_len = 5
output_filename = search_term + '.csv'
pathway_string = search_term
# CREATE CSV IF IT DOESN'T ALREADY EXIST
if not os.path.exists(output_filename):
with open(output_filename, 'a') as csv:
csv.write('search_term,id,downloaded_file' + '\n' + search_term)
# RUN PROCESS UNTIL SOMETHING BREAKS :)
while True:
# SEARCH
search_query = search_term.lower() + ' AND (collection:' + collection + ')'
return_data = [ 'identifier' ]
print '\nsearching for "' + search_term + '"...'
search = ia.Search(search_query, return_data)
if (search.num_found > 0):
result = search.results.next()
id = result['identifier']
print '\nfound:'
print ' id: ' + id
print ' url: ' + 'http://archive.org/details/' + id
else:
print ' no search results, sorry!'
break
# DOWNLOAD
print '\ndownloading first search result...'
download_string = 'wget -r -H -nc -np -nH -q --cut-dirs=2 -e robots=off -l1 -A ' + file_format + ' -P ' + download_folder + ' http://archive.org/download/' + id
os.system(download_string)
downloaded_files = os.listdir(download_folder + '/' + id)
for file in downloaded_files:
if 'meta' not in file and file.endswith('.txt'):
print ' ' + file
downloaded_filename = download_folder + '/' + id + '/' + file
break
# EXTRACT WORDS AND COUNT FREQUENCY
print '\ncounting word frequencies in "' + file + '"...'
text = ''
with open(downloaded_filename) as file:
for line in file:
text += line
tokenizer = RegexpTokenizer('\w+')
words = []
for word in tokenizer.tokenize(text):
if len(word) > min_word_len:
words.append(word.lower())
freq_dist = FreqDist(words)
# GET 10 MOST FREQUENT WORDS OVER A CERTAIN LENGTH
most_freq = []
for i, word in enumerate(freq_dist.keys()):
if word == 'project' or word == 'gutenberg': # skip, just in case
continue
most_freq.append(word)
print ' ' + str(freq_dist[word]) + ': ' + word
if i >= 10:
break
# NEXT SEARCH TERM
for term in most_freq:
if term != search_term:
search_term = term
break
print '\nnext search term: "' + search_term + '"'
print ''
pathway_string += ' > ' + search_term
# SAVE RESULTS TO FILE
with open(output_filename, 'a') as csv: # append to existing file
csv.write('\n' + search_term + ',' + id + ',' + downloaded_filename)
# PRINT A DIVIDER AND CONTINUE
print '- ' * 20
# DONE (or broken)
print '\n' + ('- ' * 20) + '\n'
print 'resulting pathway:\n' + pathway_string
print '\nDONE!' + ('\n' * 3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment