TimRepke/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Solr import/export

You need to move from one solr instance to another and can't be bothered with mismatching versions or whatever?
These two scripts will help you :)
First you need to create a new core in the target instance.
You may want to use the schema/configset from the originating instance though, as the default schema might not be ideal.
In my scenario I moved from Solr 5.5.5 to Solr 7.4.
Therefore I had to (at least) update the solrconfig.xml, where the lucene version is specified.
The exact version you need can be found in the default configset ([solr_root]/server/solr/configsets/...)
The first script downloads an entire core into one file (one JSON document per line).
python download_solr.py 127.0.0.1 8983 my_core my_core.json

The second one uploads it to a server.
python upload_solr.py 127.0.0.1 8983 my_core my_core.json

IMPORTANT
The download script is extremely stupid. Once it's done, it will continue requesting the server.
Fortunately tqdm will give you a rough estimate of how long it may take to finish, so you know how long your coffee break can be. There will be no harm if it runs for a while (except heating up your server). Once the progress bar is full, kill the script with Ctrl+C.
FYI: Downloading 2.5G (380k docs) took 5 minutes, uploading them again took just over an hour.
Requirements to install:

pysolr
requests
tqdm

You could remove tqdm from the code. It's only there for a nice progress bar. requests could also be removed, but I'm too lazy to update the download script to also use pysolr (or vice versa).

  
## download_solr.py
import requests
import json
import argparse
from tqdm import tqdm

# Dump every document of a Solr core into a file, one JSON document per line.
#
# Usage: python download_solr.py HOST PORT CORE TARGET [--rows N]
#
# The core is walked with Solr cursor-based deep paging (cursorMark), which
# requires a sort on the unique key field; this script assumes that field is
# called "id".
parser = argparse.ArgumentParser(description='Download all documents from a solr core')
parser.add_argument('HOST', help='Host of solr server, e.g. 127.0.0.1 for localhost')
parser.add_argument('PORT', help='Port of solr server, e.g. 8983 for default port')
parser.add_argument('CORE', help='Core to fetch from server')
parser.add_argument('TARGET', help='target file to write to (one line = one doc as json)')
parser.add_argument('--rows', help='number of rows per request', type=int, default=100)
args = parser.parse_args()

solr_url = args.HOST
solr_port = args.PORT
solr_core = args.CORE

select_url = 'http://{}:{}/solr/{}/select'.format(solr_url, solr_port, solr_core)
# Parameters shared by every request. They are handed to requests as a dict so
# that each value is URL-encoded properly; cursor marks in particular may
# contain characters such as '+', '/' and '=' that must not be sent raw.
base_params = {'q': '*:*', 'sort': 'id asc', 'wt': 'json'}

# Ask for zero rows first: only numFound is needed to size the progress bar.
r = requests.get(select_url, params=dict(base_params, rows=0))
r.raise_for_status()
num_results = r.json()['response']['numFound']

with tqdm(total=num_results) as pbar, open(args.TARGET, 'w') as f:
    cursor_mark = '*'  # '*' asks Solr to start a fresh cursor
    while True:
        r = requests.get(select_url,
                         params=dict(base_params, rows=args.rows, cursorMark=cursor_mark))
        r.raise_for_status()  # fail loudly instead of parsing an error page
        res = r.json()

        docs = res['response']['docs']
        for doc in docs:
            f.write(json.dumps(doc) + '\n')
        pbar.update(len(docs))

        # Solr returns the cursor mark it was given once the result set is
        # exhausted; that is the signal to stop requesting.
        next_cursor_mark = res['nextCursorMark']
        if next_cursor_mark == cursor_mark:
            break
        cursor_mark = next_cursor_mark

## upload_solr.py
import pysolr
import json
import argparse
from tqdm import tqdm

# Upload a dump file (one JSON document per line) into a Solr core.
#
# Usage: python upload_solr.py HOST PORT CORE SOURCE [--rows N]
#
# Documents are sent in batches of --rows documents per request.
parser = argparse.ArgumentParser(description='Upload all documents from a file to a solr core')
parser.add_argument('HOST', help='Host of solr server, e.g. 127.0.0.1 for localhost')
parser.add_argument('PORT', help='Port of solr server, e.g. 8983 for default port')
parser.add_argument('CORE', help='Core to write to on server')
parser.add_argument('SOURCE', help='source file to read from (one line = one doc as json)')
# type=int matters: without it a value given on the command line stays a string
# and the batch-size comparison below raises TypeError.
parser.add_argument('--rows', help='number of rows per request', type=int, default=100)
args = parser.parse_args()

solr_url = args.HOST
solr_port = args.PORT
solr_core = args.CORE

client = pysolr.Solr('http://{}:{}/solr/{}'.format(solr_url, solr_port, solr_core))

# Count the lines up front so the progress bar knows its total; the file is
# opened in a with-block so the handle is closed again right away.
with open(args.SOURCE, 'r') as f:
    num_docs = sum(1 for _ in f)

with tqdm(total=num_docs) as pbar, open(args.SOURCE, 'r') as f:
    buffer = []
    for line in f:
        doc = json.loads(line)
        # Drop the version stamp of the originating core so it is not sent to
        # the target (Solr uses _version_ for optimistic concurrency checks).
        # pop() tolerates documents that do not carry the field.
        doc.pop('_version_', None)
        buffer.append(doc)
        if len(buffer) >= args.rows:
            client.add(buffer)
            pbar.update(len(buffer))
            buffer = []
    # Flush the last, partially filled batch; skip the request if it is empty.
    if buffer:
        client.add(buffer)
        pbar.update(len(buffer))

# Recent pysolr versions do not commit on add() by default, so commit once at
# the end to make the uploaded documents visible.
client.commit()
	import requests
	import json
	import argparse
	from tqdm import tqdm

	# Dump every document of a Solr core into a file, one JSON document per line.
	#
	# Usage: python download_solr.py HOST PORT CORE TARGET [--rows N]
	#
	# The core is walked with Solr cursor-based deep paging (cursorMark), which
	# requires a sort on the unique key field; this script assumes that field is
	# called "id".
	parser = argparse.ArgumentParser(description='Download all documents from a solr core')
	parser.add_argument('HOST', help='Host of solr server, e.g. 127.0.0.1 for localhost')
	parser.add_argument('PORT', help='Port of solr server, e.g. 8983 for default port')
	parser.add_argument('CORE', help='Core to fetch from server')
	parser.add_argument('TARGET', help='target file to write to (one line = one doc as json)')
	parser.add_argument('--rows', help='number of rows per request', type=int, default=100)
	args = parser.parse_args()

	solr_url = args.HOST
	solr_port = args.PORT
	solr_core = args.CORE

	select_url = 'http://{}:{}/solr/{}/select'.format(solr_url, solr_port, solr_core)
	# Parameters shared by every request. The match-all query is '*:*'. Values are
	# handed to requests as a dict so that each one is URL-encoded properly;
	# cursor marks may contain characters such as '+', '/' and '='.
	base_params = {'q': '*:*', 'sort': 'id asc', 'wt': 'json'}

	# Ask for zero rows first: only numFound is needed to size the progress bar.
	r = requests.get(select_url, params=dict(base_params, rows=0))
	r.raise_for_status()
	num_results = r.json()['response']['numFound']

	with tqdm(total=num_results) as pbar, open(args.TARGET, 'w') as f:
	    cursor_mark = '*'  # '*' asks Solr to start a fresh cursor
	    while True:
	        r = requests.get(select_url,
	                         params=dict(base_params, rows=args.rows, cursorMark=cursor_mark))
	        r.raise_for_status()  # fail loudly instead of parsing an error page
	        res = r.json()

	        docs = res['response']['docs']
	        for doc in docs:
	            f.write(json.dumps(doc) + '\n')
	        pbar.update(len(docs))

	        # Solr returns the cursor mark it was given once the result set is
	        # exhausted; that is the signal to stop requesting.
	        next_cursor_mark = res['nextCursorMark']
	        if next_cursor_mark == cursor_mark:
	            break
	        cursor_mark = next_cursor_mark
	import pysolr
	import json
	import argparse
	from tqdm import tqdm

	# Upload a dump file (one JSON document per line) into a Solr core.
	#
	# Usage: python upload_solr.py HOST PORT CORE SOURCE [--rows N]
	#
	# Documents are sent in batches of --rows documents per request.
	parser = argparse.ArgumentParser(description='Upload all documents from a file to a solr core')
	parser.add_argument('HOST', help='Host of solr server, e.g. 127.0.0.1 for localhost')
	parser.add_argument('PORT', help='Port of solr server, e.g. 8983 for default port')
	parser.add_argument('CORE', help='Core to write to on server')
	parser.add_argument('SOURCE', help='source file to read from (one line = one doc as json)')
	# type=int matters: without it a value given on the command line stays a string
	# and the batch-size comparison below raises TypeError.
	parser.add_argument('--rows', help='number of rows per request', type=int, default=100)
	args = parser.parse_args()

	solr_url = args.HOST
	solr_port = args.PORT
	solr_core = args.CORE

	client = pysolr.Solr('http://{}:{}/solr/{}'.format(solr_url, solr_port, solr_core))

	# Count the lines up front so the progress bar knows its total; the file is
	# opened in a with-block so the handle is closed again right away.
	with open(args.SOURCE, 'r') as f:
	    num_docs = sum(1 for _ in f)

	with tqdm(total=num_docs) as pbar, open(args.SOURCE, 'r') as f:
	    buffer = []
	    for line in f:
	        doc = json.loads(line)
	        # Drop the version stamp of the originating core so it is not sent to
	        # the target (Solr uses _version_ for optimistic concurrency checks).
	        # pop() tolerates documents that do not carry the field.
	        doc.pop('_version_', None)
	        buffer.append(doc)
	        if len(buffer) >= args.rows:
	            client.add(buffer)
	            pbar.update(len(buffer))
	            buffer = []
	    # Flush the last, partially filled batch; skip the request if it is empty.
	    if buffer:
	        client.add(buffer)
	        pbar.update(len(buffer))

	# Recent pysolr versions do not commit on add() by default, so commit once at
	# the end to make the uploaded documents visible.
	client.commit()