Skip to content

Instantly share code, notes, and snippets.

@kylemarkwilliams
Last active August 29, 2015 14:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kylemarkwilliams/2b8e32a98b379d968c8d to your computer and use it in GitHub Desktop.
Save kylemarkwilliams/2b8e32a98b379d968c8d to your computer and use it in GitHub Desktop.
A Python client for SimSeerX
#!/usr/bin/python2.7
"""
@author: Kyle Williams <kwilliams@psu.edu>
This is a client for SimSeerX.
It can easily be used to submit documents and retrieve similar files from SimSeerX via the RESTful interface.
"""
import requests
class SimSeerXClient:
    """
    Client for the SimSeerX RESTful interface.

    Documents are submitted either as a file or as raw text, and a similarity
    search is then run with the token returned by the submission. Every
    request method returns a ``(status_code, content)`` tuple, where
    ``content`` is the raw response body. Argument errors detected by
    ``search`` are reported as ``(-1, message)`` without sending a request.
    """

    def __init__(self, url='http://simseerx.ist.psu.edu', academic=0):
        """
        Store the base URL of the SimSeerX service and the academic flag.
        See ``set_academic`` for the meaning of the flag.
        """
        self.url = url
        self.academic = academic

    def set_academic(self, academic):
        """
        A flag for whether or not the input documents should be treated as academic.
        If this is set to 1, then the CiteSeerX metadata extractors will be used to extract header information.
        """
        self.academic = academic

    def submitFile(self, file_path):
        """
        Submits a new document to SimSeerX.
        If successful, returns a status code of 200 and a token.
        The token uniquely identifies the submitted document in the SimSeerX system and can be used for searching.
        If an error occurs during submission, a non-200 code and message will be returned.
        The code and response should always be checked before proceeding.

        The file at ``file_path`` is opened in binary mode and uploaded as the
        ``file`` form field; the academic flag is sent as ``extraction``.
        """
        submit_form = {'extraction': self.academic}
        # The context manager closes the file even if the request raises.
        with open(file_path, 'rb') as handle:
            r = requests.post(self.url + '/submitFileForToken',
                              data=submit_form, files={'file': handle})
        return r.status_code, r.content

    def submitText(self, text):
        """
        Submits raw text to SimSeerX as the ``text`` form field.
        Returns the status code and the raw response body, which should be
        checked before proceeding.
        """
        r = requests.post(self.url + '/submitText', data={'text': text})
        return r.status_code, r.content

    @staticmethod
    def _as_int(value):
        """Return ``int(value)``, or None when the value is not integer-like."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    def search(self, token, method, option, ranking, collection, hops=1, numresults=10, divider=1):
        """
        Conducts an actual search in SimSeerX.
        The search is based on the token returned by the submit method.
        Returns a 200 status code and XML results if search was successful.

        ``method`` must be 'keyphrase', 'simhash' or 'shingles', and ``option``
        must suit the method: 'keyphrases' or 'text' for keyphrase, an integer
        in [1,5] for simhash, and 3, 5 or 8 for shingles. Numeric options may
        be given as integers or as numeric strings. Invalid combinations
        return ``(-1, message)`` and no request is made.
        """
        # Put some error checking in the client. Getting these values wrong doesn't break SimSeerX, but just causes it to return nothing
        if method == 'keyphrase':
            if option not in ('text', 'keyphrases'):
                return -1, "Keyphrase options are: 'keyphrases', 'text'"
        elif method == 'simhash':
            level = self._as_int(option)
            # Reject anything outside 1..5, including values that are not numbers.
            if level is None or not 1 <= level <= 5:
                return -1, "simhash options are an integer in the range [1,5]"
        elif method == 'shingles':
            if self._as_int(option) not in (3, 5, 8):
                return -1, "shingle options are: '3', '5', '8'"
        else:
            return -1, "Unsupportmed method. Methods are: 'keyphrase', 'simhash', 'shingles'"

        # A list of pairs keeps the parameter order stable, and letting
        # requests build the query string takes care of URL-encoding and of
        # options passed as integers.
        query = [
            ('option', option),
            ('ranking', ranking),
            ('collection', collection),
            ('hops', str(hops)),
            ('numresults', str(numresults)),
            ('divider', str(divider)),
            ('xml', '1'),
            ('extraction', str(self.academic)),
        ]
        r = requests.get(self.url + '/search/' + token + '/' + method, params=query)
        return r.status_code, r.content
if __name__ == '__main__':
    # Example of use: submit one document, then search for similar
    # Wikipedia documents by keyphrase.
    client = SimSeerXClient('http://simseerx.ist.psu.edu', academic=0)

    # client.submitText(text) can be used here instead of submitFile.
    code, response = client.submitFile('/home/kyle/Documents/projects/data/wiki_sample/354697998.txt')

    if code == 200:
        # Submission succeeded, so the response is passed on as the search token.
        code, response = client.search(token=response, method='keyphrase', option='keyphrases', ranking='cosine', collection='Wikipedia')

    # One check covers both steps: code is 200 only when the submission and
    # the search both succeeded; otherwise it holds the first failing status.
    if code == 200:
        print(response)
    else:
        print('%s %s' % (code, response))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment