Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jeffThompson/6718129 to your computer and use it in GitHub Desktop.
Save jeffThompson/6718129 to your computer and use it in GitHub Desktop.
A curatorial experiment through a residency with the Internet Archive: searches the Internet Archive for a given term, downloads the first result, and parses it for its most frequent word.
#!/usr/bin/python
'''
MOST FREQUENT WORD SEARCH
Jeff Thompson | 2013 | www.jeffreythompson.org
A curatorial experiment through a residency with the Internet Archive.
REQUIRES:
+ Natural Language Toolkit (NLTK)
+ Internet Archive search module
'''
import internetarchive as ia
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer
import os
search_term = 'test'
collection = 'gutenberg'
download_folder = 'DownloadedFiles'
file_format = '.txt'
min_word_len = 5
output_filename = search_term + '.csv'
pathway_string = search_term
# CREATE CSV IF IT DOESN'T ALREADY EXIST
if not os.path.exists(output_filename):
with open(output_filename, 'a') as csv:
csv.write('search_term,id,downloaded_file' + '\n' + search_term)
# RUN PROCESS UNTIL SOMETHING BREAKS :)
while True:
# SEARCH
search_query = search_term.lower() + ' AND (collection:' + collection + ')'
return_data = [ 'identifier' ]
print '\nsearching for "' + search_term + '"...'
search = ia.Search(search_query, return_data)
if (search.num_found > 0):
result = search.results.next()
id = result['identifier']
print '\nfound:'
print ' id: ' + id
print ' url: ' + 'http://archive.org/details/' + id
else:
print ' no search results, sorry!'
break
# DOWNLOAD
print '\ndownloading first search result...'
download_string = 'wget -r -H -nc -np -nH -q --cut-dirs=2 -e robots=off -l1 -A ' + file_format + ' -P ' + download_folder + ' http://archive.org/download/' + id
os.system(download_string)
downloaded_files = os.listdir(download_folder + '/' + id)
for file in downloaded_files:
if 'meta' not in file and file.endswith('.txt'):
print ' ' + file
downloaded_filename = download_folder + '/' + id + '/' + file
break
# EXTRACT WORDS AND COUNT FREQUENCY
print '\ncounting word frequencies in "' + file + '"...'
text = ''
with open(downloaded_filename) as file:
for line in file:
text += line
tokenizer = RegexpTokenizer('\w+')
words = []
for word in tokenizer.tokenize(text):
if len(word) > min_word_len:
words.append(word.lower())
freq_dist = FreqDist(words)
# GET 10 MOST FREQUENT WORDS OVER A CERTAIN LENGTH
most_freq = []
for i, word in enumerate(freq_dist.keys()):
if word == 'project' or word == 'gutenberg': # skip, just in case
continue
most_freq.append(word)
print ' ' + str(freq_dist[word]) + ': ' + word
if i >= 10:
break
# NEXT SEARCH TERM
for term in most_freq:
if term != search_term:
search_term = term
break
print '\nnext search term: "' + search_term + '"'
print ''
pathway_string += ' > ' + search_term
# SAVE RESULTS TO FILE
with open(output_filename, 'a') as csv: # append to existing file
csv.write('\n' + search_term + ',' + id + ',' + downloaded_filename)
# PRINT A DIVIDER AND CONTINUE
print '- ' * 20
# DONE (or broken)
print '\n' + ('- ' * 20) + '\n'
print 'resulting pathway:\n' + pathway_string
print '\nDONE!' + ('\n' * 3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment