declare
p_failed_node alias for $1;
p_backup_node alias for $2;
v_row record;
v_row2 record;
v_n int4;
begin
-- ----
-- Grab the central configuration lock
-- ----
lock table sl_config_lock;
-- ----
-- All consistency checks first
-- Check that every node that has a path to the failed node
-- also has a path to the backup node.
-- ----
for v_row in select P.pa_client
from sl_path P
where P.pa_server = p_failed_node
and P.pa_client <> p_backup_node
and not exists (select true from sl_path PP
where PP.pa_server = p_backup_node
and PP.pa_client = P.pa_client)
loop
raise exception 'Slony-I: cannot failover - node % has no path to the backup node',
v_row.pa_client;
end loop;
-- ----
-- Check all sets originating on the failed node
-- ----
for v_row in select set_id
from sl_set
where set_origin = p_failed_node
loop
-- ----
-- Check that the backup node is subscribed to all sets
-- that originate on the failed node
-- ----
select into v_row2 sub_forward, sub_active
from sl_subscribe
where sub_set = v_row.set_id
and sub_receiver = p_backup_node;
if not found then
raise exception 'Slony-I: cannot failover - node % is not subscribed to set %',
p_backup_node, v_row.set_id;
end if;
-- ----
-- Check that the subscription is active
-- ----
if not v_row2.sub_active then
raise exception 'Slony-I: cannot failover - subscription for set % is not active',
v_row.set_id;
end if;
-- ----
-- If there are other subscribers, the backup node needs to
-- be a forwarder too.
-- ----
select into v_n count(*)
from sl_subscribe
where sub_set = v_row.set_id
and sub_receiver <> p_backup_node;
if v_n > 0 and not v_row2.sub_forward then
raise exception 'Slony-I: cannot failover - node % is not a forwarder of set %',
p_backup_node, v_row.set_id;
end if;
end loop;
-- ----
-- Terminate all connections of the failed node the hard way
-- ----
perform terminateNodeConnections(p_failed_node);
-- ----
-- Move the sets
-- ----
for v_row in select S.set_id, (select count(*)
from sl_subscribe SUB
where S.set_id = SUB.sub_set
and SUB.sub_receiver <> p_backup_node
and SUB.sub_provider = p_failed_node)
as num_direct_receivers
from sl_set S
where S.set_origin = p_failed_node
for update
loop
-- ----
-- If the backup node is the only direct subscriber ...
-- ----
if v_row.num_direct_receivers = 0 then
raise notice 'failedNode: set % has no other direct receivers - move now', v_row.set_id;
-- ----
-- backup_node is the only direct subscriber, move the set
-- right now. On the backup node itself that includes restoring
-- all user mode triggers, removing the protection trigger,
-- adding the log trigger, removing the subscription and the
-- obsolete setsync status.
-- ----
if p_backup_node = getLocalNodeId('_schemadoc') then
update sl_set set set_origin = p_backup_node
where set_id = v_row.set_id;
delete from sl_setsync
where ssy_setid = v_row.set_id;
for v_row2 in select * from sl_table
where tab_set = v_row.set_id
order by tab_id
loop
perform alterTableConfigureTriggers(v_row2.tab_id);
end loop;
end if;
delete from sl_subscribe
where sub_set = v_row.set_id
and sub_receiver = p_backup_node;
else
raise notice 'failedNode: set % has other direct receivers - change providers only', v_row.set_id;
-- ----
-- Backup node is not the only direct subscriber or not
-- a direct subscriber at all.
-- This means that at this moment, we redirect all possible
-- direct subscribers to receive from the backup node, and the
-- backup node itself to receive from another one.
-- The admin utility will wait for the slon engine to
-- restart and then call failedNode2() on the node with
-- the highest SYNC and redirect this to it on
-- backup node later.
-- ----
update sl_subscribe
set sub_provider = (select min(SS.sub_receiver)
from sl_subscribe SS
where SS.sub_set = v_row.set_id
and SS.sub_receiver <> p_backup_node
and SS.sub_forward
and exists (
select 1 from sl_path
where pa_server = SS.sub_receiver
and pa_client = p_backup_node
))
where sub_set = v_row.set_id
and sub_receiver = p_backup_node;
update sl_subscribe
set sub_provider = (select min(SS.sub_receiver)
from sl_subscribe SS
where SS.sub_set = v_row.set_id
and SS.sub_receiver <> p_failed_node
and SS.sub_forward
and exists (
select 1 from sl_path
where pa_server = SS.sub_receiver
and pa_client = sl_subscribe.sub_receiver
))
where sub_set = v_row.set_id
and sub_receiver <> p_backup_node;
update sl_subscribe
set sub_provider = p_backup_node
where sub_set = v_row.set_id
and sub_receiver <> p_backup_node
and exists (
select 1 from sl_path
where pa_server = p_backup_node
and pa_client = sl_subscribe.sub_receiver
);
end if;
end loop;
-- Rewrite sl_listen table
perform RebuildListenEntries();
-- Run addPartialLogIndices() to try to add indices to unused sl_log_? table
perform addPartialLogIndices();
-- ----
-- Make sure the node daemon will restart
-- ----
notify "_schemadoc_Restart";
-- ----
-- That is it - so far.
-- ----
return p_failed_node;
end; |