Ceph Large omap objects
#Check bucket limits:
$ radosgw-admin bucket limit check | jq '.[].buckets[].fill_status' | uniq
$ radosgw-admin bucket limit check | jq '.[].buckets[].objects_per_shard' | sort -n | uniq
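#A hedged example: list only the buckets that are over or near the limit (the bucket and fill_status field names assume the JSON layout emitted by recent radosgw-admin versions):
$ radosgw-admin bucket limit check | jq -r '.[].buckets[] | select(.fill_status != "OK") | "\(.bucket): \(.fill_status)"'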
#Default WARN Thresholds:
"osd_deep_scrub_large_omap_object_key_threshold": "2000000",
"osd_deep_scrub_large_omap_object_value_sum_threshold": "1073741824" #1G
#Warnings you get in the cluster log:
Key threshold (default): cluster [WRN] Large omap object found. Object: 5:755f2873:::omap_obj_1541077_0:head Key count: 2000001 Size (bytes): 24888903
Value threshold (tested with 200MB): cluster [WRN] Large omap object found. Object: 9:b68141c4:::omap_obj_1646389_0:head Key count: 1900000 Size (bytes): 235888900
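#The same condition also surfaces as a LARGE_OMAP_OBJECTS health warning; a quick way to spot it (the log path assumes the default /var/log/ceph location):
$ ceph health detail | grep -i 'large omap'
$ grep 'Large omap object' /var/log/ceph/ceph-osd.*.log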
#The threshold at which the warning is reported to Ceph can be changed; check the current value on a monitor:
$ ceph daemon mon.ceph-mon-1 config get osd_deep_scrub_large_omap_object_value_sum_threshold
{
"osd_deep_scrub_large_omap_object_value_sum_threshold": "2147483648"
}
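#A hedged sketch of raising the value threshold to 2 GiB cluster-wide (ceph config set needs Mimic or later; on older releases injectargs achieves the same):
$ ceph config set osd osd_deep_scrub_large_omap_object_value_sum_threshold 2147483648
$ ceph tell osd.* injectargs '--osd_deep_scrub_large_omap_object_value_sum_threshold=2147483648'
#Either way, the new value is only evaluated on the next deep scrub of the affected PGs.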
#Query all PGs in the index pool and list those holding large omap objects:
$ for i in `ceph pg ls-by-pool tor.rgw.buckets.index | grep -v NOTE | tail -n +2 | awk '{print $1}'`; do echo -n "$i: "; ceph pg $i query | grep num_large_omap_objects | head -1 | awk '{print $2}'; done
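#A shorter alternative using the JSON output (assumes Nautilus or later, where ceph pg ls-by-pool nests the per-PG stats under pg_stats):
$ ceph pg ls-by-pool tor.rgw.buckets.index -f json | jq -r '.pg_stats[] | select(.stat_sum.num_large_omap_objects > 0) | .pgid'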
#Find which PG stores the index omap object for each bucket in $buckets (see the sketch below for one way to populate it):
$ for i in $buckets; do id=`radosgw-admin bucket stats --bucket $i |grep \"id\" | cut -d\" -f4` ; pg=`ceph osd map tor.rgw.buckets.index ${id} | awk '{print $11}' | cut -d\( -f2 | cut -d\) -f1` ; echo "$i:$id:$pg"; done
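#The loop above assumes $buckets already holds the bucket names; a minimal sketch to populate it:
$ buckets=$(radosgw-admin bucket list | jq -r '.[]')
#Note that sharded bucket indexes are stored as .dir.<bucket_id>.<shard> objects, so a per-shard mapping may be needed for resharded buckets.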
#Once we have the PGs with large omap objects, we can deep scrub the OSDs to which they belong; first map a PG to its OSDs:
# ceph pg map 2.26
osdmap e8768 pg 2.26 (2.26) -> up [29,94,37] acting [29,94,37]
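#A hedged one-liner that deep scrubs every OSD in the acting set directly (assumes ceph pg map supports -f json, as recent releases do):
$ for osd in $(ceph pg map 2.26 -f json | jq -r '.acting[]'); do ceph osd deep-scrub osd.$osd; done
#Alternatively, ceph pg deep-scrub 2.26 targets just that PG instead of everything on those three OSDs.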
#Now that we’ve identified which OSDs are hosting the large omap objects, we need to run a deep scrub on them.
ceph osd deep-scrub osd.29
ceph osd deep-scrub osd.37
ceph osd deep-scrub osd.94
#Once the deep scrub completes on the OSDs, the cluster should be in a healthy state again.
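#A quick way to confirm the warning has cleared once the deep scrubs finish (no output from the grep means LARGE_OMAP_OBJECTS is gone):
$ ceph health detail | grep -i large_omap
$ ceph -s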