Skip to content

Instantly share code, notes, and snippets.

@alangampel
Created June 2, 2014 14:34
Show Gist options
  • Save alangampel/7a46df8cb92dfe821b8e to your computer and use it in GitHub Desktop.
Save alangampel/7a46df8cb92dfe821b8e to your computer and use it in GitHub Desktop.
Third program: tokenizes text fetched from a URL, then searches Google for each consecutive group of N words.
import urllib
import urllib.request
import urllib.parse
import json
# Base address of the web-search service; the encoded query is appended to it.
# NOTE(review): this looks like the legacy Google AJAX Search API -- confirm
# the endpoint is still served before relying on this script.
SEARCH_ENDPOINT = "http://ajax.googleapis.com/ajax/services/search/web?v=1.0&"


def fetch_text(source_url):
    """Download ``source_url`` and return its body decoded as UTF-8.

    The response is used as a context manager so the connection is released
    as soon as the body has been read.
    """
    with urllib.request.urlopen(source_url) as reply:
        return reply.read().decode("utf-8")


def search_phrases(tokens, token_count):
    """Return every run of ``token_count`` consecutive tokens as one string.

    The window slides one token at a time and includes the final window
    (the last ``token_count`` tokens).  Tokens inside a phrase are separated
    by a single space, with no trailing space.

    Args:
        tokens: sequence of word strings, in document order.
        token_count: number of words per phrase; must be at least 1.

    Returns:
        A list of phrases; empty when there are fewer than ``token_count``
        tokens.

    Raises:
        ValueError: if ``token_count`` is smaller than 1.
    """
    if token_count < 1:
        raise ValueError('token_count must be at least 1')
    # range() over a non-positive bound is empty, which covers the
    # "not enough tokens" case without a separate check.
    window_starts = range(len(tokens) - token_count + 1)
    return [' '.join(tokens[start:start + token_count])
            for start in window_starts]


def build_query_url(phrase):
    """Return the full search URL for ``phrase``.

    ``urlencode`` takes care of escaping spaces and non-ASCII characters.
    """
    return SEARCH_ENDPOINT + urllib.parse.urlencode({'q': phrase})


def fetch_results(query_url):
    """Request ``query_url`` and return the reply parsed from JSON."""
    with urllib.request.urlopen(query_url) as reply:
        return json.loads(reply.read().decode("utf-8"))


def main():
    """Prompt for a text URL and a phrase length, then search each phrase.

    The text at the URL is split on whitespace; every run of consecutive
    words of the requested length is sent to the search service and the
    title and URL of each returned result are printed.
    """
    print('\n')
    print('----------------------------------')
    print('Searching for internet matches')
    print('----------------------------------')
    # fetch the file (the original author's note says it holds Greek content)
    print('\n')
    url_str = input('File URL? ')
    # tokenize: split() with no argument splits on any run of whitespace
    tokens = fetch_text(url_str).split()
    print('\n')
    print('There are ' + str(len(tokens)) + ' tokens in the file')
    print('All tokens from file:\n')
    print(tokens)
    print('\n')
    # how many words go into each search phrase
    token_count = int(input('How many words in search phrase? '))
    print('\n')
    phrases = search_phrases(tokens, token_count)
    for search_number, content_str in enumerate(phrases, start=1):
        # delineate new query
        print('-----------------------------------------------------------')
        print("Search number " + str(search_number))
        print("Content string is:")
        print(content_str)
        print('')
        # build the URL
        query_url = build_query_url(content_str)
        print("URL query is: ")
        print('')
        print(query_url)
        print('')
        # get the response from the search service
        json_data = fetch_results(query_url)
        print("JSON data: ")
        print(json_data)
        response_data = json_data.get('responseData')
        if response_data is None:
            print("No data for this query")
            continue
        # format the response and print it
        for rank, result in enumerate(response_data['results'], start=1):
            print("JSON result ", rank, ':')
            print(result['title'] + '; ' + result['url'])
            print('')
        print('\n')


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment