-
-
Save kylemarkwilliams/2b8e32a98b379d968c8d to your computer and use it in GitHub Desktop.
A Python client for SimSeerX
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python2.7
"""
@author: Kyle Williams <kwilliams@psu.edu>
This is a client for SimSeerX.
It can easily be used to submit documents and retrieve similar files from SimSeerX via the RESTful interface.
"""
import requests | |
class SimSeerXClient:
    """
    Client for the SimSeerX RESTful interface.

    Documents (or raw text) are submitted with submitFile/submitText and
    similar documents are then retrieved with search.
    Every request method returns a (status_code, content) tuple.
    """

    def __init__(self, url='http://simseerx.ist.psu.edu', academic=0):
        """
        url is the base address of the SimSeerX service.
        academic is the flag described in set_academic.
        """
        self.url = url
        self.academic = academic

    def set_academic(self, academic):
        """
        A flag for whether or not the input documents should be treated as academic.
        If this is set to 1, then the CiteSeerX metadata extractors will be used to extract header information.
        """
        self.academic = academic

    def submitFile(self, file_path):
        """
        Submits a new document to SimSeerX.
        If successful, returns a status code of 200 and a token.
        The token uniquely identifies the submitted document in the SimSeerX system and can be used for searching.
        If an error occurs during submission, a non-200 code and message will be returned.
        The code and response should always be checked before proceeding.
        """
        submit_form = {'extraction': self.academic}
        # The with-block closes the file even if the POST raises.
        with open(file_path, 'rb') as handle:
            r = requests.post(self.url + '/submitFileForToken', data=submit_form, files={'file': handle})
        return r.status_code, r.content

    def submitText(self, text):
        """
        Submits raw text to SimSeerX.
        Returns the status code and the response body of the request.
        """
        r = requests.post(self.url + '/submitText', data={'text': text})
        return r.status_code, r.content

    def _search_params(self, option, ranking, collection, hops, numresults, divider):
        """
        Builds the ordered query parameters for a search request.
        Kept separate from search so the parameters can be inspected without doing a request.
        """
        return [
            ('option', str(option)),  # option may be given as an int (simhash, shingles)
            ('ranking', ranking),
            ('collection', collection),
            ('hops', str(hops)),
            ('numresults', str(numresults)),
            ('divider', str(divider)),
            ('xml', '1'),
            ('extraction', str(self.academic)),
        ]

    def search(self, token, method, option, ranking, collection, hops=1, numresults=10, divider=1):
        """
        Conducts an actual search in SimSeerX.
        The search is based on the token returned by the submit method.
        Returns a 200 status code and XML results if search was successful.
        If method or option is not supported, no request is made and -1 and an error message are returned.
        """
        # Put some error checking in the client. Getting these values wrong doesn't break SimSeerX, but just causes it to return nothing
        if method == 'keyphrase':
            if option not in ('text', 'keyphrases'):
                return -1, "Keyphrase options are: 'keyphrases', 'text'"
        elif method == 'simhash':
            try:
                in_range = 1 <= int(option) <= 5
            except (TypeError, ValueError):
                in_range = False
            if not in_range:
                return -1, "simhash options are an integer in the range [1,5]"
        elif method == 'shingles':
            try:
                supported = int(option) in (3, 5, 8)
            except (TypeError, ValueError):
                supported = False
            if not supported:
                return -1, "shingle options are: '3', '5', '8'"
        else:
            return -1, "Unsupported method. Methods are: 'keyphrase', 'simhash', 'shingles'"

        # On Python 3 the token returned by the submit methods is bytes; the URL needs text.
        # On Python 2 bytes is str, so the token is left untouched there.
        if isinstance(token, bytes) and not isinstance(token, str):
            token = token.decode('utf-8')

        # Passing the parameters via params= lets requests URL-encode the values.
        r = requests.get(
            self.url + '/search/' + token + '/' + method,
            params=self._search_params(option, ranking, collection, hops, numresults, divider),
        )
        return r.status_code, r.content
if __name__ == '__main__':
    # An example of use: submit a document, then search with the returned token.
    client = SimSeerXClient('http://simseerx.ist.psu.edu', academic=0)  # Initialize a new simseer client
    code, response = client.submitFile('/home/kyle/Documents/projects/data/wiki_sample/354697998.txt')  # Submit a document
    # Can also use client.submitText(text)
    if code == 200:  # Make sure submission was successful
        # Do a search based on the returned token
        code, response = client.search(token=response, method='keyphrase', option='keyphrases', ranking='cosine', collection='Wikipedia')
        if code == 200:
            print(response)
        else:
            # Single-argument print: same output as the Python 2 statement
            # "print code, response", and also valid syntax on Python 3.
            print("%s %s" % (code, response))
    else:
        print("%s %s" % (code, response))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment