We are running into an intermittent issue in botvinnik-1
where certain queries originating from a caputil server cause database locks that prevent users from logging into the system entirely.
We’ve been unable to determine a specific trigger that causes this issue to occur, which has complicated our troubleshooting efforts.
Here’s what we see in postgres:
[root@pg-vd01 ~]# ps -ef | grep post
postgres 1467 1 0 Apr14 ? 00:08:08 /usr/pgsql-9.2/bin/postmaster -p 5432 -D /var/lib/pgsql/9.2/data
postgres 1470 1467 0 Apr14 ? 00:00:00 postgres: logger process
root 1597 1 0 Apr14 ? 00:00:34 /usr/libexec/postfix/master
postfix 1619 1597 0 Apr14 ? 00:00:05 qmgr -l -t fifo -u
postgres 1641 1467 0 Apr14 ? 01:27:46 postgres: checkpointer process
postgres 1642 1467 0 Apr14 ? 00:26:29 postgres: writer process
postgres 1643 1467 0 Apr14 ? 00:07:02 postgres: wal writer process
postgres 1644 1467 0 Apr14 ? 00:29:30 postgres: autovacuum launcher process
postgres 1645 1467 0 Apr14 ? 02:13:59 postgres: stats collector process
postgres 4296 1467 0 Aug12 ? 00:00:32 postgres: evm vmdb_production 10.193.16.150(34498) idle
postgres 5395 1467 0 Jul26 ? 00:01:11 postgres: evm vmdb_production 10.193.16.151(46328) idle
postgres 10383 1467 0 Jul27 ? 00:04:34 postgres: evm vmdb_production 10.193.16.151(47316) idle
postfix 10713 1597 0 08:27 ? 00:00:00 pickup -l -t fifo -u
root 11523 11227 0 10:03 pts/0 00:00:00 grep post
postgres 25562 1467 0 Jul20 ? 01:25:48 postgres: evm vmdb_production 10.193.16.150(48074) idle in transaction
postgres 25568 1467 0 Jul20 ? 00:00:01 postgres: evm vmdb_production 10.193.16.150(48084) idle
postgres 25570 1467 0 Jul20 ? 00:00:01 postgres: evm vmdb_production 10.193.16.150(48087) idle
postgres 25571 1467 0 Jul20 ? 00:02:32 postgres: evm vmdb_production 10.193.16.150(48090) idle
postgres 25572 1467 0 Jul20 ? 00:02:29 postgres: evm vmdb_production 10.193.16.150(48091) idle
postgres 25573 1467 0 Jul20 ? 00:11:44 postgres: evm vmdb_production 10.193.16.150(48094) idle
postgres 25574 1467 0 Jul20 ? 00:00:01 postgres: evm vmdb_production 10.193.16.150(48104) idle
postgres 25576 1467 0 Jul20 ? 00:08:09 postgres: evm vmdb_production 10.193.16.150(48111) idle
postgres 25577 1467 0 Jul20 ? 00:00:05 postgres: evm vmdb_production 10.193.16.150(48112) idle
postgres 25580 1467 0 Jul20 ? 00:00:01 postgres: evm vmdb_production 10.193.16.150(48126) idle
postgres 25581 1467 0 Jul20 ? 00:00:27 postgres: evm vmdb_production 10.193.16.150(48128) idle
postgres 25582 1467 0 Jul20 ? 00:00:27 postgres: evm vmdb_production 10.193.16.150(48129) idle
postgres 25583 1467 0 Jul20 ? 04:35:55 postgres: evm vmdb_production 10.193.16.150(48139) idle
postgres 25584 1467 0 Jul20 ? 04:39:20 postgres: evm vmdb_production 10.193.16.150(48140) idle
postgres 25590 1467 0 Jul20 ? 00:00:01 postgres: evm vmdb_production 10.193.16.150(48150) idle
postgres 30319 1467 0 Jul23 ? 00:56:53 postgres: evm vmdb_production 10.193.16.151(38351) SELECT waiting
postgres 30324 1467 0 Jul23 ? 00:03:12 postgres: evm vmdb_production 10.193.16.151(38361) idle
postgres 30325 1467 0 Jul23 ? 00:03:10 postgres: evm vmdb_production 10.193.16.151(38363) idle
postgres 30326 1467 0 Jul23 ? 00:01:24 postgres: evm vmdb_production 10.193.16.151(38364) idle
postgres 30328 1467 0 Jul23 ? 00:01:29 postgres: evm vmdb_production 10.193.16.151(38369) idle
postgres 30335 1467 0 Jul23 ? 00:05:50 postgres: evm vmdb_production 10.193.16.151(38381) idle
postgres 30336 1467 0 Jul23 ? 00:00:01 postgres: evm vmdb_production 10.193.16.151(38387) idle
postgres 30338 1467 0 Jul23 ? 00:55:53 postgres: evm vmdb_production 10.193.16.151(38390) idle
postgres 30486 1467 0 Jul23 ? 00:55:20 postgres: evm vmdb_production 10.193.16.151(38445) idle
10.193.16.151
is the server with the Capacity & Utilization role.
vmdb_production=# SELECT relation::regclass, * FROM pg_locks WHERE NOT GRANTED;
-[ RECORD 1 ]------+--------------
relation |
locktype | transactionid
database |
relation |
page |
tuple |
virtualxid |
transactionid | 247726421
classid |
objid |
objsubid |
virtualtransaction | 18/15671302
pid | 30319
mode | ShareLock
granted | f
fastpath | f
vmdb_production=# SELECT blocked_locks.pid AS blocked_pid,
vmdb_production-# blocked_activity.usename AS blocked_user,
vmdb_production-# blocking_locks.pid AS blocking_pid,
vmdb_production-# blocking_activity.usename AS blocking_user,
vmdb_production-# blocked_activity.query AS blocked_statement,
vmdb_production-# blocking_activity.query AS blocking_statement
vmdb_production-# FROM pg_catalog.pg_locks blocked_locks
vmdb_production-# JOIN pg_catalog.pg_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid
vmdb_production-# JOIN pg_catalog.pg_locks blocking_locks
vmdb_production-# ON blocking_locks.locktype = blocked_locks.locktype
vmdb_production-# AND blocking_locks.DATABASE IS NOT DISTINCT FROM blocked_locks.DATABASE
vmdb_production-# AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
vmdb_production-# AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
vmdb_production-# AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
vmdb_production-# AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
vmdb_production-# AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
vmdb_production-# AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
vmdb_production-# AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
vmdb_production-# AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
vmdb_production-# AND blocking_locks.pid != blocked_locks.pid
vmdb_production-# JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid
vmdb_production-# WHERE NOT blocked_locks.GRANTED;
-[ RECORD 1 ]------+--------------------------------------------------------------------------------------------------------------------------------------------------------
blocked_pid | 30319
blocked_user | evm
blocking_pid | 25562
blocking_user | evm
blocked_statement | SELECT "miq_regions".* FROM "miq_regions" WHERE "miq_regions"."id" = $1 LIMIT 1 FOR UPDATE
blocking_statement | SELECT "server_roles".* FROM "server_roles" INNER JOIN "assigned_server_roles" ON "server_roles"."id" = "assigned_server_roles"."server_role_id" WHERE "assigned_server_roles"."miq_server_id" = 1
vmdb_production=# SELECT bl.pid AS blocked_pid,
vmdb_production-# a.usename AS blocked_user,
vmdb_production-# ka.query AS blocking_statement,
vmdb_production-# now() - ka.query_start AS blocking_duration,
vmdb_production-# kl.pid AS blocking_pid,
vmdb_production-# ka.usename AS blocking_user,
vmdb_production-# a.query AS blocked_statement,
vmdb_production-# now() - a.query_start AS blocked_duration
vmdb_production-# FROM pg_catalog.pg_locks bl
vmdb_production-# JOIN pg_catalog.pg_stat_activity a ON a.pid = bl.pid
vmdb_production-# JOIN pg_catalog.pg_locks kl ON kl.transactionid = bl.transactionid AND kl.pid != bl.pid
vmdb_production-# JOIN pg_catalog.pg_stat_activity ka ON ka.pid = kl.pid
vmdb_production-# WHERE NOT bl.GRANTED;
-[ RECORD 1 ]------+---------------------------------------------------------------------------------------------------------------------------------------------------------
blocked_pid | 30319
blocked_user | evm
blocking_statement | SELECT "server_roles".* FROM "server_roles" INNER JOIN "assigned_server_roles" ON "server_roles"."id" = "assigned_server_roles"."server_role_id" WHERE "assigned_server_roles"."miq_server_id" = 1
blocking_duration | 6 days 00:19:00.233021
blocking_pid | 25562
blocking_user | evm
blocked_statement | SELECT "miq_regions".* FROM "miq_regions" WHERE "miq_regions"."id" = $1 LIMIT 1 FOR UPDATE
blocked_duration | 6 days 00:16:34.606241
There are no obvious errors in evm.log
.
Once the lock has occurred, production.log
begins to fill with the following:
[----] I, [2015-08-18T03:51:14.378205 #23674:f87e88] INFO -- : Started HEAD "/" for 127.0.0.1 at 2015-08-18 03:51:14 -0700
[----] I, [2015-08-18T03:51:14.380987 #23674:f87e88] INFO -- : Processing by DashboardController#login as HTML
[----] I, [2015-08-18T03:51:14.399345 #23674:f87e88] INFO -- : Rendered layouts/_spinner.html.haml (1.0ms)
[----] I, [2015-08-18T03:51:14.399784 #23674:f87e88] INFO -- : Rendered layouts/_flash_msg.html.haml (0.1ms)
[----] I, [2015-08-18T03:51:14.411621 #23674:f87e88] INFO -- : Rendered dashboard/login.html.haml within layouts/login (13.9ms)
[----] I, [2015-08-18T03:51:14.417160 #23674:f87e88] INFO -- : Completed 200 OK in 36.0ms (Views: 18.3ms | ActiveRecord: 2.6ms)
[----] I, [2015-08-18T03:51:19.376914 #23674:f87e88] INFO -- : Started HEAD "/" for 127.0.0.1 at 2015-08-18 03:51:19 -0700
[----] I, [2015-08-18T03:51:19.379460 #23674:f87e88] INFO -- : Processing by DashboardController#login as HTML
[----] I, [2015-08-18T03:51:19.398037 #23674:f87e88] INFO -- : Rendered layouts/_spinner.html.haml (1.0ms)
[----] I, [2015-08-18T03:51:19.398539 #23674:f87e88] INFO -- : Rendered layouts/_flash_msg.html.haml (0.2ms)
[----] I, [2015-08-18T03:51:19.410368 #23674:f87e88] INFO -- : Rendered dashboard/login.html.haml within layouts/login (13.9ms)
[----] I, [2015-08-18T03:51:19.415882 #23674:f87e88] INFO -- : Completed 200 OK in 36.3ms (Views: 18.5ms | ActiveRecord: 2.5ms)
[----] I, [2015-08-18T03:51:24.377232 #23674:f87e88] INFO -- : Started HEAD "/" for 127.0.0.1 at 2015-08-18 03:51:24 -0700
[----] I, [2015-08-18T03:51:24.379847 #23674:f87e88] INFO -- : Processing by DashboardController#login as HTML
[----] I, [2015-08-18T03:51:24.619502 #23674:f87e88] INFO -- : Rendered layouts/_spinner.html.haml (1.0ms)
[----] I, [2015-08-18T03:51:24.621313 #23674:f87e88] INFO -- : Rendered layouts/_flash_msg.html.haml (0.1ms)
[----] I, [2015-08-18T03:51:24.634158 #23674:f87e88] INFO -- : Rendered dashboard/login.html.haml within layouts/login (16.3ms)
[----] I, [2015-08-18T03:51:24.639934 #23674:f87e88] INFO -- : Completed 200 OK in 260.0ms (Views: 21.1ms | ActiveRecord: 2.7ms)
[----] I, [2015-08-18T03:51:29.377451 #23674:f87e88] INFO -- : Started HEAD "/" for 127.0.0.1 at 2015-08-18 03:51:29 -0700
...etc...
Is anyone else seeing this issue?