Zach Dunn zadunn

## problem_statement.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                zadunn
                / problem_statement.md
            
            
              Created
              June 14, 2017 19:52
            
              
                nginx epoll error
              
          
    Dockerized nginx instances show the following error under load:
2017/06/13 14:26:50 [alert] 4051#4051: epoll_wait() failed (1: Operation not permitted)
2017/06/13 14:26:50 [alert] 4051#4051: epoll_wait() failed (1: Operation not permitted)
2017/06/13 14:26:50 [alert] 4051#4051: epoll_wait() failed (1: Operation not permitted)
2017/06/13 14:26:50 [alert] 4051#4051: epoll_wait() failed (1: Operation not permitted)
2017/06/13 14:26:50 [alert] 4051#4051: epoll_wait() failed (1: Operation not permitted)
2017/06/13 14:26:50 [alert] 4051#4051: epoll_wait() failed (1: Operation not permitted)
2017/06/13 14:26:50 [alert] 4051#4051: epoll_wait() failed (1: Operation not permitted)


## gist:b0b5e239fa753dacd93e85bae7205330
Unfreezing cluster state

/opt/smartdc/sdcadm/node_modules/sdc-clients/node_modules/assert-plus/assert.js:45
                        throw new assert.AssertionError({
                              ^
AssertionError: opts.started (number) is required
    at SAPI.addHistory (/opt/smartdc/sdcadm/node_modules/sdc-clients/lib/sapi.js:503:12)
    at /opt/smartdc/sdcadm/lib/history.js:418:30
    at /opt/smartdc/sdcadm/node_modules/sdc-clients/lib/restifyclient.js:110:20
    at parseResponse (/opt/smartdc/sdcadm/node_modules/sdc-clients/node_modules/restify/lib/clients/json_client.js:84:9)

## gist:f8d5c63a48646e5c783d82e9007c6b07
[2016-08-02T22:25:45.876Z] ERROR: manatee-sitter/cluster/9639 on 9a1b7646-92e1-4d09-ad4b-a31e1e20554b (/opt/smartdc/manatee/node_modules/manatee/node_modules/manatee-state-machine/lib/manatee
-peer.js:1088 in finishPgApply):
    applying pg config: postgres exited unexpectedly (code 1); stdout = , stderr = 2016-08-02 22:25:38.767 UTCFATAL:  could not create shared memory segment: Invalid argument
    2016-08-02 22:25:38.767 UTCDETAIL:  Failed system call was shmget(key=5432001, size=2262786048, 03600).
    2016-08-02 22:25:38.767 UTCHINT:  This error usually means that PostgreSQL's request for a shared memory segment exceeded your kernel's SHMMAX parameter.  You can either reduce the reques
t size or reconfigure the kernel with larger SHMMAX.  To reduce the request size (currently 2262786048 bytes), reduce PostgreSQL's shared memory usage, perhaps by reducing shared_buffers or m
ax_connections.
        If the request size is already small, it's possible that it is less than your kernel's SHMMIN parameter, in

## from async

[root@9a1b7646-92e1-4d09-ad4b-a31e1e20554b (iad3:manatee1) ~]# manatee-adm pg-status
ROLE     PEER     PG   REPL  SENT          FLUSH         REPLAY        LAG
primary  f162e187 ok   sync  3A/947B8CE8   3A/947B8CE8   3A/947B8940   -
sync     37f1156e ok   -     -             -             -             -
async    9a1b7646 fail -     -             -             -             -

warning: peer "37f1156e": downstream replication peer not connected

## from primary
[root@f162e187-d677-4da8-ab24-63ff5a290d73 (iad3:manatee2) ~]# manatee-adm pg-status
ROLE     PEER     PG   REPL  SENT          FLUSH         REPLAY        LAG
primary  f162e187 ok   sync  3A/9444F7D8   3A/9444F7D8   3A/9444F430   -
sync     37f1156e ok   -     -             -             -             -
async    9a1b7646 fail -     -             -             -             -

warning: peer "37f1156e": downstream replication peer not connected

## from sync
[root@37f1156e-e682-4876-9668-eb752e307f13 (iad3:manatee0) ~]# manatee-adm pg-status
ROLE     PEER     PG   REPL  SENT          FLUSH         REPLAY        LAG
primary  f162e187 ok   sync  3A/942DB868   3A/942DB868   3A/942DB4C0   -
sync     37f1156e ok   -     -             -             -             -
async    9a1b7646 fail -     -             -             -             -

warning: peer "37f1156e": downstream replication peer not connected

## gist:fcc146f3dbfddaf09a86c714e4defda4
[root@bugbear-torpedo (iad3) ~]# imgadm import d370f5f2-8beb-4abf-d676-8cdae3c37d63
imgadm import: error (ActiveImageNotFound): an active image "d370f5f2-8beb-4abf-d676-8cdae3c37d63" was not found in image sources

## gist:1c46cbc5f7262aa9218bc2d8e1b9acf8
[root@headnode (iad3) ~]# sdc-sapi /instances/9a1b7646-92e1-4d09-ad4b-a31e1e20554b/upgrade -X PUT -d '{"image_uuid": "d370f5f2-8beb-4abf-d676-8cdae3c37d63"}'
HTTP/1.1 500 Internal Server Error
Content-Type: application/json
Content-Length: 1211
Date: Tue, 02 Aug 2016 21:52:13 GMT
Connection: keep-alive

{
  "code": "ReprovisionFailedError",
  "message": "job reprovision-7.0.3 (7198789b-0c8f-4c8a-bac7-a49198a2bb87) failed: cnapi.wait_task: mage-id validated\",\"time\":\"2016-08-02T21:52:11.198Z\",\"v\":0}\n{\"name\":\"imgadm\",\"req_id\":\"5c2457a0-58fb-11e6-b441-23127a880929\",\"hostname\":\"bugbear-torpedo.us-east.optoro.io\",\"pid\":4087,\"level\":20,\"subcmd\":\"import\",\"exitStatus\":1,\"cli\":true,\"msg\":\"cli exit\",\"time\":\"2016-08-02T21:52:11.308Z\",\"v\":0}\nimgadm import: error (ActiveImageNotFound): ActiveImageNotFoundError: an active image \"d370f5f2-8beb-4abf-d676-8cdae3c37d63\" was not found in image sources\n    at /usr/img/lib/cli.js:1386:26\n    at finish (/usr/img/lib/imgadm.js:1127:9)\n

## gist:0faadbd55169064f04ed4430398452db
[2016-08-02T21:31:24.036Z] ERROR: manatee-sitter/cluster/86857 on 9a1b7646-92e1-4d09-ad4b-a31e1e20554b (/opt/smartdc/manatee/node_modules/manatee/node_modules/manatee-state-machine/lib/manatee-peer.js:1088 in finishPgApply):
    applying pg config: postgres exited unexpectedly (code 1); stdout = , stderr = 2016-08-02 21:31:22.163 UTCFATAL:  could not create shared memory segment: Invalid argument
    2016-08-02 21:31:22.163 UTCDETAIL:  Failed system call was shmget(key=5432001, size=2262786048, 03600).
    2016-08-02 21:31:22.163 UTCHINT:  This error usually means that PostgreSQL's request for a shared memory segment exceeded your kernel's SHMMAX parameter.  You can either reduce the request size or reconfigure the kernel with larger SHMMAX.  To reduce the request size (currently 2262786048 bytes), reduce PostgreSQL's shared memory usage, perhaps by reducing shared_buffers or max_connections.
        If the request size is already small, it's possible that it is less than your kernel's SHMMIN parameter, in wh

## gist:12407eca1316f9c9c730d53c12c0b40d
[root@headnode (iad3) ~]# sdcadm insts manatee
INSTANCE                              SERVICE  HOSTNAME                           VERSION                                     ALIAS
f162e187-d677-4da8-ab24-63ff5a290d73  manatee  bugbear-kermit.us-east.optoro.io   release-20160707-20160707T033728Z-g8e996f2  manatee2
9a1b7646-92e1-4d09-ad4b-a31e1e20554b  manatee  bugbear-torpedo.us-east.optoro.io  release-20160707-20160707T033728Z-g8e996f2  manatee1
37f1156e-e682-4876-9668-eb752e307f13  manatee  headnode                           release-20160721-20160721T181401Z-g1f05fea  manatee0
	Unfreezing cluster state

	/opt/smartdc/sdcadm/node_modules/sdc-clients/node_modules/assert-plus/assert.js:45
	throw new assert.AssertionError({
	^
	AssertionError: opts.started (number) is required
	at SAPI.addHistory (/opt/smartdc/sdcadm/node_modules/sdc-clients/lib/sapi.js:503:12)
	at /opt/smartdc/sdcadm/lib/history.js:418:30
	at /opt/smartdc/sdcadm/node_modules/sdc-clients/lib/restifyclient.js:110:20
	at parseResponse (/opt/smartdc/sdcadm/node_modules/sdc-clients/node_modules/restify/lib/clients/json_client.js:84:9)
	[2016-08-02T22:25:45.876Z] ERROR: manatee-sitter/cluster/9639 on 9a1b7646-92e1-4d09-ad4b-a31e1e20554b (/opt/smartdc/manatee/node_modules/manatee/node_modules/manatee-state-machine/lib/manatee
	-peer.js:1088 in finishPgApply):
	applying pg config: postgres exited unexpectedly (code 1); stdout = , stderr = 2016-08-02 22:25:38.767 UTCFATAL: could not create shared memory segment: Invalid argument
	2016-08-02 22:25:38.767 UTCDETAIL: Failed system call was shmget(key=5432001, size=2262786048, 03600).
	2016-08-02 22:25:38.767 UTCHINT: This error usually means that PostgreSQL's request for a shared memory segment exceeded your kernel's SHMMAX parameter. You can either reduce the reques
	t size or reconfigure the kernel with larger SHMMAX. To reduce the request size (currently 2262786048 bytes), reduce PostgreSQL's shared memory usage, perhaps by reducing shared_buffers or m
	ax_connections.
	If the request size is already small, it's possible that it is less than your kernel's SHMMIN parameter, in

	[root@9a1b7646-92e1-4d09-ad4b-a31e1e20554b (iad3:manatee1) ~]# manatee-adm pg-status
	ROLE PEER PG REPL SENT FLUSH REPLAY LAG
	primary f162e187 ok sync 3A/947B8CE8 3A/947B8CE8 3A/947B8940 -
	sync 37f1156e ok - - - - -
	async 9a1b7646 fail - - - - -

	warning: peer "37f1156e": downstream replication peer not connected
	[root@f162e187-d677-4da8-ab24-63ff5a290d73 (iad3:manatee2) ~]# manatee-adm pg-status
	ROLE PEER PG REPL SENT FLUSH REPLAY LAG
	primary f162e187 ok sync 3A/9444F7D8 3A/9444F7D8 3A/9444F430 -
	sync 37f1156e ok - - - - -
	async 9a1b7646 fail - - - - -

	warning: peer "37f1156e": downstream replication peer not connected
	[root@37f1156e-e682-4876-9668-eb752e307f13 (iad3:manatee0) ~]# manatee-adm pg-status
	ROLE PEER PG REPL SENT FLUSH REPLAY LAG
	primary f162e187 ok sync 3A/942DB868 3A/942DB868 3A/942DB4C0 -
	sync 37f1156e ok - - - - -
	async 9a1b7646 fail - - - - -

	warning: peer "37f1156e": downstream replication peer not connected
	[root@bugbear-torpedo (iad3) ~]# imgadm import d370f5f2-8beb-4abf-d676-8cdae3c37d63
	imgadm import: error (ActiveImageNotFound): an active image "d370f5f2-8beb-4abf-d676-8cdae3c37d63" was not found in image sources
	[root@headnode (iad3) ~]# sdc-sapi /instances/9a1b7646-92e1-4d09-ad4b-a31e1e20554b/upgrade -X PUT -d '{"image_uuid": "d370f5f2-8beb-4abf-d676-8cdae3c37d63"}'
	HTTP/1.1 500 Internal Server Error
	Content-Type: application/json
	Content-Length: 1211
	Date: Tue, 02 Aug 2016 21:52:13 GMT
	Connection: keep-alive

	{
	"code": "ReprovisionFailedError",
	"message": "job reprovision-7.0.3 (7198789b-0c8f-4c8a-bac7-a49198a2bb87) failed: cnapi.wait_task: mage-id validated\",\"time\":\"2016-08-02T21:52:11.198Z\",\"v\":0}\n{\"name\":\"imgadm\",\"req_id\":\"5c2457a0-58fb-11e6-b441-23127a880929\",\"hostname\":\"bugbear-torpedo.us-east.optoro.io\",\"pid\":4087,\"level\":20,\"subcmd\":\"import\",\"exitStatus\":1,\"cli\":true,\"msg\":\"cli exit\",\"time\":\"2016-08-02T21:52:11.308Z\",\"v\":0}\nimgadm import: error (ActiveImageNotFound): ActiveImageNotFoundError: an active image \"d370f5f2-8beb-4abf-d676-8cdae3c37d63\" was not found in image sources\n at /usr/img/lib/cli.js:1386:26\n at finish (/usr/img/lib/imgadm.js:1127:9)\n
	[2016-08-02T21:31:24.036Z] ERROR: manatee-sitter/cluster/86857 on 9a1b7646-92e1-4d09-ad4b-a31e1e20554b (/opt/smartdc/manatee/node_modules/manatee/node_modules/manatee-state-machine/lib/manatee-peer.js:1088 in finishPgApply):
	applying pg config: postgres exited unexpectedly (code 1); stdout = , stderr = 2016-08-02 21:31:22.163 UTCFATAL: could not create shared memory segment: Invalid argument
	2016-08-02 21:31:22.163 UTCDETAIL: Failed system call was shmget(key=5432001, size=2262786048, 03600).
	2016-08-02 21:31:22.163 UTCHINT: This error usually means that PostgreSQL's request for a shared memory segment exceeded your kernel's SHMMAX parameter. You can either reduce the request size or reconfigure the kernel with larger SHMMAX. To reduce the request size (currently 2262786048 bytes), reduce PostgreSQL's shared memory usage, perhaps by reducing shared_buffers or max_connections.
	If the request size is already small, it's possible that it is less than your kernel's SHMMIN parameter, in wh
	[root@headnode (iad3) ~]# sdcadm insts manatee
	INSTANCE SERVICE HOSTNAME VERSION ALIAS
	f162e187-d677-4da8-ab24-63ff5a290d73 manatee bugbear-kermit.us-east.optoro.io release-20160707-20160707T033728Z-g8e996f2 manatee2
	9a1b7646-92e1-4d09-ad4b-a31e1e20554b manatee bugbear-torpedo.us-east.optoro.io release-20160707-20160707T033728Z-g8e996f2 manatee1
	37f1156e-e682-4876-9668-eb752e307f13 manatee headnode release-20160721-20160721T181401Z-g1f05fea manatee0