solrSimServer.py
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 19 12:04:03 2016
@author: Semiha
"""
"""
how to run with indexing data on Solr. This code, it is answer.
"""
from simserver import SessionServer
from gensim import utils
import os, io, sys
from nltk.corpus import stopwords
import solr
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 hack; the source data is presumably cp1254 (Turkish) encoded
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
server = SessionServer('sessionSolrServer') # resume server (or create a new one)
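
# Note: per the comment above and simserver's docs, the session state is
# persisted under the 'sessionSolrServer' directory, so re-running the script
# resumes the existing trained/indexed session rather than starting over.
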
def solrReadFile(start=0):
    corpusList = []
    url = "http://127.0.0.1:8983/solr/core1"  # localhost/ip:port
    conn = solr.SolrConnection(url=url)
    response = conn.query('*:*', rows=1)  # match-all query, just to read the total document count
    searched = conn.query('*:*', rows=response.numFound)  # fetch every document
    i = start
    for result in searched.results:
        try:
            categories = result["categories"][0].lower()  # Solr field holding the category label
            text_tmp = result["features"][0].lower().split("\n")  # Solr field holding the document text
            sentences = []
            for sentence in text_tmp:
                if sentence.replace(" ", ""):  # skip lines that are empty or all spaces
                    sentences.append(sentence)
            text = ' '.join(sentences)
            # drop the Tika metadata header that leaks into the extracted text
            text = text.replace(' stream_size null x-parsed-by org.apache.tika.parser.defaultparser x-parsed-by org.apache.tika.parser.txt.txtparser stream_content_type application/txt content-encoding utf-8 content-type text/plain; charset=utf-8 ', '')
            tr_stopwords = stopwords.words('turkish')
            tr_stopwords.extend([u"an", u"en", u"on", u"ön", u"ın",
                                 u"un", u"ün", u"in", u"ta", u"te",
                                 u"tu", u"tü", u"tı", u"ti", u"to",
                                 u"tö", u"dan", u"den", u"dun",
                                 u"dın", u"din"])
            textToken = utils.simple_preprocess(text)
            textToken = list(set(textToken) - set(tr_stopwords))
tagger=""
if categories == "dunya":
tagger="dunya_%i"%i
elif categories == "ekonomi":
tagger="ekonomi_%i"%i
elif categories == "kultur-sanat":
tagger="kultur-sanat_%i"%i
elif categories == "politika":
tagger="politika_%i" %i
elif categories == "spor":
tagger="spor_%i"%i
i+=1
corpusList.append({'id': tagger,
'tokens': textToken,
'text':text})
except:
pass
return corpusList
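
# For illustration only: each entry appended above follows the plain dict shape
# simserver expects for a document (the example values here are made up):
#   {'id': 'spor_12',
#    'tokens': [u'galatasaray', u'transfer', u'sezon'],
#    'text': u'the cleaned article text'}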
if __name__ == '__main__':
    train_corpus = solrReadFile()  # corpus built from the Solr index
    utils.upload_chunked(server, train_corpus,
                         chunksize=1000)  # send 1k docs at a time
    server.train(train_corpus, method='lsi')  # create a semantic model
    server.index(train_corpus)

    # test: query the session with plain-text files from a local "test" directory
    testDataPath = os.path.join(os.path.dirname(__file__), "test")
    filenames = os.listdir(testDataPath)
    for filename in filenames:
        print(filename)
        f = io.open(os.path.join(testDataPath, filename),
                    mode="r",
                    encoding="utf8")
        query = f.read().lower()
        f.close()
        doc = {'tokens': utils.simple_preprocess(query)}
        print(server.find_similar(doc, min_score=0.43, max_results=50))
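        # find_similar returns a list of (document_id, similarity, payload)
        # triples sorted by decreasing similarity; with min_score=0.43 a result
        # might look roughly like this (example values made up):
        #   [('spor_12', 0.87, None), ('spor_3', 0.61, None)]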