@haridas
Created November 23, 2018 05:17
Ensure that a particular field in a big JSON-lines file doesn't include nulls. Helpful as part of a data-cleanup process.
import json

import pandas as pd


def read_json_lines(fname, field_name):
    """Scan a JSON-lines file and collect the token count of `field_name`
    for each record; records where the field is missing, null, or not a
    string are returned separately for later cleanup."""
    num = 0
    doc_size = []
    error_docs = []
    with open(fname) as f:
        while True:
            line = f.readline()
            if not line:
                break
            print(num)
            num += 1
            d = json.loads(line)
            try:
                doc_size.append(len(d[field_name].split()))
            except Exception:
                # Field is missing, null, or not a string.
                error_docs.append(d)
    return doc_size, error_docs


doc_size, error_docs = read_json_lines("./file.json", "data")

# Quick analysis of doc sizes, useful when pd.read_json / pd.read_csv fails
# to read the original file.
# Fix the error_docs by removing or updating them.
doc_df = pd.DataFrame(doc_size, columns=["doc_size"])
doc_df.describe()
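Once the problem records are identified, a cleanup pass can drop (or patch) them and write a new JSON-lines file that pandas can read directly. The sketch below is an illustrative assumption, not part of the original gist: the clean_json_lines helper and the output path are made up, and it simply drops records whose field is missing or not a non-empty string.

import json


def clean_json_lines(src, dst, field_name):
    """Copy src to dst, keeping only records whose `field_name` is a
    non-empty string. (Hypothetical helper, not part of the original gist.)"""
    kept, dropped = 0, 0
    with open(src) as fin, open(dst, "w") as fout:
        for line in fin:
            d = json.loads(line)
            if isinstance(d.get(field_name), str) and d[field_name].strip():
                fout.write(json.dumps(d) + "\n")
                kept += 1
            else:
                dropped += 1
    return kept, dropped


kept, dropped = clean_json_lines("./file.json", "./file_clean.json", "data")
print(kept, dropped)

# The cleaned file should then load without errors, e.g.:
# df = pd.read_json("./file_clean.json", lines=True)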