Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Detecting clock skew
/*******************************************************************************
* This file is part of OpenNMS(R).
*
* Copyright (C) 2019 The OpenNMS Group, Inc.
* OpenNMS(R) is Copyright (C) 1999-2019 The OpenNMS Group, Inc.
*
* OpenNMS(R) is a registered trademark of The OpenNMS Group, Inc.
*
* OpenNMS(R) is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License,
* or (at your option) any later version.
*
* OpenNMS(R) is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with OpenNMS(R). If not, see:
* http://www.gnu.org/licenses/
*
* For more information contact:
* OpenNMS(R) Licensing <license@opennms.org>
* http://www.opennms.org/
* http://www.opennms.com/
*******************************************************************************/
package org.opennms.netmgt.alarmd.usecases;
import java.util.Date;
import java.util.List;
import org.kie.api.time.SessionClock;
import org.opennms.netmgt.model.OnmsAlarm;
import org.opennms.netmgt.model.OnmsEvent;
import org.opennms.netmgt.model.OnmsEventParameter;
import org.opennms.netmgt.model.OnmsSeverity;
import org.opennms.netmgt.model.TroubleTicketState
import org.opennms.netmgt.model.events.EventBuilder;
global org.opennms.netmgt.alarmd.drools.AlarmService alarmService;
/**
This ruleset can be used to passively detect clock skew in network devices
by inspecting the timestamps of alarms triggered by syslog messages.
To help ensure the results are accurate, we only trigger alarms if there is at least one recent non-skewed alarm
for some node. This implies that the system is functioning properly for some subset of the nodes.
The following event definition should be added to the system for the associated alarm to be created:
<event>
<uei>uei.opennms.org/nodes/clockSkewDetected</uei>
<event-label>Clock Skew Detected</event-label>
<logmsg dest="logndisplay">Clock skew was detected: %parm[maxSkewMs]% ms</logmsg>
<descr>Clock skew was passively detected with min: %parm[minSkewMs]% ms, max: %parm[maxSkewMs]% ms and avg: %parm[avgSkewMs]% ms.</descr>
<severity>Major</severity>
<alarm-data reduction-key="%uei%:%nodeid%" alarm-type="3" />
<parameter name="doNotCorrelate" value="true" expand="false"/>
</event>
*/
// Adds CEP metadata to the existing OnmsAlarm class: alarms are handled as
// events, and the alarm's lastUpdateTime property supplies the event timestamp
// used by temporal constraints and sliding windows.
declare org.opennms.netmgt.model.OnmsAlarm
@role(event)
@timestamp(lastUpdateTime)
end
// Internal event recording that an alarm with a skewed timestamp was observed.
// Inserted by the "detect skewed alarms" rule and aggregated per node by the
// "trigger alarm for skew" rule.
//   skew      - value returned by getSkew() for the alarm's last event (milliseconds)
//   alarmId   - id of the alarm the skew was measured on
//   nodeId    - id of the node the alarm belongs to
//   createdAt - session-clock time at which this fact was created; used as the event timestamp
// The event declares a 1 hour expiration offset.
declare SkewedAlarm
@role(event)
@expires(1h)
@timestamp(createdAt)
skew: long
alarmId: int
nodeId: int
createdAt: Date
end
// Marker event remembering that the clock skew event was already sent for a node.
// Inserted by the "trigger alarm for skew" rule, which will not fire again for
// the same nodeId while a matching marker is present.
//   nodeId    - id of the node the clock skew event was sent for
//   createdAt - session-clock time at which the event was sent; used as the event timestamp
// The event declares a 24 hour expiration offset.
declare TriggeredClockSkewedAlarm
@role(event)
@expires(24h)
@timestamp(createdAt)
nodeId: int
createdAt: Date
end
// Marker event stating that at least one non-skewed syslog alarm was seen.
// Inserted by the "validate that some alarms are not skewed" rule and required
// by the other rules in this file before they act.
//   createdAt - session-clock time at which the marker was created; used as the event timestamp
// The event declares a 15 minute expiration offset.
// NOTE: createdAt is the only field, so the generated one-argument constructor
// takes it directly (used as "new ExistsNonSkewedAlarms(now)").
declare ExistsNonSkewedAlarms
@role(event)
@expires(15m)
@timestamp(createdAt)
createdAt: Date
end
/**
 * Measures the distance between the time carried by the event and the time at
 * which the event was created.
 *
 * Returns the absolute difference in milliseconds, or 0 when either of the two
 * timestamps is missing. The event itself must not be null.
 */
function long getSkew(OnmsEvent event) {
    final Date reportedAt = event.getEventTime();
    final Date createdAt = event.getEventCreateTime();
    if (reportedAt != null && createdAt != null) {
        return Math.abs(reportedAt.getTime() - createdAt.getTime());
    }
    // At least one timestamp is unavailable: report no skew.
    return 0;
}
// Inserts an ExistsNonSkewedAlarms marker when none is present and some
// syslog-sourced problem alarm has a skew of at most 30 seconds.
rule "validate that some alarms are not skewed"
when
    // Only search for evidence while no marker fact exists
    not ExistsNonSkewedAlarms()
    // lastEvent is null-checked before its properties are dereferenced
    $alarm : OnmsAlarm(alarmType == OnmsAlarm.PROBLEM_TYPE,
                       severity.isGreaterThan(OnmsSeverity.CLEARED),
                       lastEvent != null,
                       lastEvent.eventSource == "syslogd",
                       getSkew(lastEvent) <= 30000)
then
    alarmService.info("Found some un-skewed alarm: {}.", $alarm);
    // Stamp the marker with the session clock rather than the wall clock
    final long sessionTimeMs = drools.getWorkingMemory().getSessionClock().getCurrentTime();
    insert(new ExistsNonSkewedAlarms(new Date(sessionTimeMs)));
end
// Records a SkewedAlarm event for syslog-sourced problem alarms whose last
// event shows a skew of more than 30 seconds.
// Nothing is recorded unless an ExistsNonSkewedAlarms marker is present.
rule "detect skewed alarms"
when
    ExistsNonSkewedAlarms()
    // The threshold is strictly greater than 30000 ms: a skew of exactly
    // 30000 ms already qualifies as non-skewed in the
    // "validate that some alarms are not skewed" rule, so it must not be
    // classified as skewed here as well.
    $alarm : OnmsAlarm(alarmType == OnmsAlarm.PROBLEM_TYPE,
                       severity.isGreaterThan(OnmsSeverity.CLEARED),
                       nodeId != null,
                       lastEvent != null,
                       lastEvent.eventSource == "syslogd",
                       getSkew(lastEvent) > 30000)
    not(SkewedAlarm( alarmId == $alarm.id )) // only add one skewed alarm for every alarm
    not(SkewedAlarm( nodeId == $alarm.nodeId ) over window:time( 10m )) // only track one skewed alarm for the node over a 10 minute window
then
    // Compute the skew once and use the same value for the log line and the fact
    long skew = getSkew($alarm.getLastEvent());
    alarmService.info("Found skewed alarm: {} on node: {} with skew: {}.", $alarm, $alarm.getNodeId(), skew);
    // Stamp the fact with the session clock rather than the wall clock
    Date now = new Date(drools.getWorkingMemory().getSessionClock().getCurrentTime());
    SkewedAlarm skewedAlarm = new SkewedAlarm();
    skewedAlarm.setSkew(skew);
    skewedAlarm.setAlarmId($alarm.getId());
    skewedAlarm.setNodeId($alarm.getNodeId());
    skewedAlarm.setCreatedAt(now);
    insert(skewedAlarm);
end
// Sends the uei.opennms.org/nodes/clockSkewDetected event for a node once at
// least two SkewedAlarm events were collected for it within the last hour.
// Requires an ExistsNonSkewedAlarms marker and does not fire for a node that
// already has a TriggeredClockSkewedAlarm marker.
rule "trigger alarm for skew"
when
    ExistsNonSkewedAlarms()
    $skewedAlarm : SkewedAlarm()
    not(TriggeredClockSkewedAlarm(nodeId == $skewedAlarm.nodeId)) // don't fire if we've already triggered the alarm
    // Gather this node's skewed alarms from the last hour along with min/max/avg skew
    accumulate( $s : SkewedAlarm( nodeId == $skewedAlarm.nodeId, $skew: skew ) over window:time( 1h ),
                $skewedAlarms : collectList( $s ),
                $max : max( $skew ),
                $min : min( $skew ),
                $avg : average( $skew ))
    eval($skewedAlarms.size() >= 2) // there are at least 2 skewed alarms in the last hour for this node
then
    alarmService.info("Triggering skew alarms for node: {} with skew: {}", $skewedAlarm.getNodeId(), $avg);

    // The session clock provides the time for both the event and the marker fact
    final long sessionTimeMs = drools.getWorkingMemory().getSessionClock().getCurrentTime();
    final Date sessionTime = new Date(sessionTimeMs);

    // Build the event: time, node and skew statistics
    EventBuilder builder = new EventBuilder("uei.opennms.org/nodes/clockSkewDetected", "clockSkewDetection");
    builder.setTime(sessionTime);
    builder.setNodeid($skewedAlarm.getNodeId());
    builder.setParam("minSkewMs", String.format("%d", $min));
    builder.setParam("maxSkewMs", String.format("%d", $max));
    builder.setParam("avgSkewMs", String.format("%.2f", $avg));

    // Asynchronously send the event
    alarmService.sendEvent(builder.getEvent());

    // Remember that the event was sent so the rule does not immediately re-trigger
    TriggeredClockSkewedAlarm marker = new TriggeredClockSkewedAlarm();
    marker.setCreatedAt(sessionTime);
    marker.setNodeId($skewedAlarm.getNodeId());
    insert(marker);
end
/*******************************************************************************
* This file is part of OpenNMS(R).
*
* Copyright (C) 2019 The OpenNMS Group, Inc.
* OpenNMS(R) is Copyright (C) 1999-2019 The OpenNMS Group, Inc.
*
* OpenNMS(R) is a registered trademark of The OpenNMS Group, Inc.
*
* OpenNMS(R) is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License,
* or (at your option) any later version.
*
* OpenNMS(R) is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with OpenNMS(R). If not, see:
* http://www.gnu.org/licenses/
*
* For more information contact:
* OpenNMS(R) Licensing <license@opennms.org>
* http://www.opennms.org/
* http://www.opennms.com/
*******************************************************************************/
package org.opennms.netmgt.alarmd.examples;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;
import static org.mockito.Matchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.reset;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.io.FileUtils;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import org.junit.runner.RunWith;
import org.mockito.ArgumentCaptor;
import org.opennms.core.test.OpenNMSJUnit4ClassRunner;
import org.opennms.core.time.PseudoClock;
import org.opennms.core.utils.ConfigFileConstants;
import org.opennms.netmgt.alarmd.drools.DefaultAlarmService;
import org.opennms.netmgt.alarmd.drools.DroolsAlarmContext;
import org.opennms.netmgt.dao.api.AcknowledgmentDao;
import org.opennms.netmgt.dao.api.AlarmDao;
import org.opennms.netmgt.dao.mock.MockTransactionTemplate;
import org.opennms.netmgt.dao.support.AlarmEntityNotifierImpl;
import org.opennms.netmgt.events.api.EventForwarder;
import org.opennms.netmgt.model.OnmsAlarm;
import org.opennms.netmgt.model.OnmsEvent;
import org.opennms.netmgt.model.OnmsNode;
import org.opennms.netmgt.model.OnmsSeverity;
import org.opennms.netmgt.model.events.EventBuilder;
import org.opennms.netmgt.xml.event.Event;
import org.opennms.test.JUnitConfigurationEnvironment;
import org.springframework.test.context.ContextConfiguration;
/**
 * Integration test for the example {@code clock-skew.drl} ruleset.
 *
 * <p>The rules are loaded into a {@link DroolsAlarmContext} that runs on a
 * pseudo clock with manual ticks, and all collaborators (DAOs, event forwarder,
 * entity notifier) are mocks. The test feeds simulated alarms into the context
 * and checks which events reach the {@link EventForwarder}.
 */
@RunWith(OpenNMSJUnit4ClassRunner.class)
@ContextConfiguration(locations={
        "classpath:/META-INF/opennms/emptyContext.xml"
})
@JUnitConfigurationEnvironment
public class ClockSkewDetectionIT {

    @Rule
    public TemporaryFolder temporaryFolder = new TemporaryFolder();

    private DroolsAlarmContext dac;
    private AlarmDao alarmDao;
    private EventForwarder eventForwarder;

    /** Source of unique alarm ids for the simulated alarms. */
    private final AtomicInteger alarmIdGenerator = new AtomicInteger();

    /**
     * Builds a rules folder containing the default rules plus the clock skew
     * example, wires a {@link DroolsAlarmContext} up with mocks and starts it.
     *
     * FIXME: Remove boilerplate
     *
     * @throws IOException if the rule files cannot be copied to the temporary folder
     */
    @Before
    public void setUp() throws IOException {
        // Copy default set of rules to a new folder
        File rulesFolder = temporaryFolder.newFolder("rules");
        FileUtils.copyDirectory(DroolsAlarmContext.getDefaultRulesFolder(), rulesFolder);

        // Copy the rules from the example folder alongside the rules in our temporary folder
        FileUtils.copyFile(Paths.get(ConfigFileConstants.getHome(), "etc", "examples",
                "alarmd", "drools-rules.d", "clock-skew.drl").toFile(),
                new File(rulesFolder, "clock-skew.drl"));

        // Wire up the engine with mocks
        dac = new DroolsAlarmContext(rulesFolder);
        dac.setUsePseudoClock(true);
        dac.setUseManualTick(true);

        MockTransactionTemplate transactionTemplate = new MockTransactionTemplate();
        transactionTemplate.afterPropertiesSet();
        dac.setTransactionTemplate(transactionTemplate);

        alarmDao = mock(AlarmDao.class);
        dac.setAlarmDao(alarmDao);

        DefaultAlarmService alarmService = new DefaultAlarmService();
        alarmService.setAlarmDao(alarmDao);

        eventForwarder = mock(EventForwarder.class);
        alarmService.setEventForwarder(eventForwarder);

        AcknowledgmentDao acknowledgmentDao = mock(AcknowledgmentDao.class);
        when(acknowledgmentDao.findLatestAckForRefId(any(Integer.class))).thenReturn(Optional.empty());
        alarmService.setAcknowledgmentDao(acknowledgmentDao);

        AlarmEntityNotifierImpl alarmEntityNotifier = mock(AlarmEntityNotifierImpl.class);
        alarmService.setAlarmEntityNotifier(alarmEntityNotifier);

        dac.setAlarmService(alarmService);
        dac.setAcknowledgmentDao(acknowledgmentDao);

        dac.start();
        dac.waitUntilStarted();
    }

    /**
     * Advances the Drools session clock and the {@link PseudoClock} by the same
     * amount. The simulated events take their timestamps from the
     * {@link PseudoClock}, so both clocks are always moved together.
     *
     * @param amount how far to move the clocks forward
     * @param unit unit of {@code amount}
     */
    private void advanceTime(int amount, TimeUnit unit) {
        dac.getClock().advanceTime(amount, unit);
        PseudoClock.getInstance().advanceTime(amount, unit);
    }

    /**
     * A fake node that raises link down alarms. Alarms whose event source is
     * "syslogd" carry an event time shifted by the node's clock skew.
     */
    public class SimulatedNode {
        private final int nodeId;
        private final long clockSkewInMs;
        /** Alarms raised by this node that have not been deleted yet. */
        private final List<OnmsAlarm> alarms = new ArrayList<>();

        /**
         * @param nodeId id reported by the mocked node attached to every alarm
         * @param clockSkewInMs amount subtracted from the event time of skewed alarms
         */
        public SimulatedNode(int nodeId, long clockSkewInMs) {
            this.nodeId = nodeId;
            this.clockSkewInMs = clockSkewInMs;
        }

        /**
         * Raises two alarms on this node: one from "syslogd" with the clock skew
         * applied, followed by one from "trapd" without skew.
         */
        public void triggerLinkDownAlarms() {
            raiseLinkDownAlarm("syslogd", true);
            raiseLinkDownAlarm("trapd", false);
        }

        /** Notifies the context that every outstanding alarm of this node was deleted. */
        public void deleteLinkDownAlarms() {
            for (OnmsAlarm alarm : alarms) {
                dac.handleDeletedAlarm(alarm.getId(), alarm.getReductionKey());
            }
            alarms.clear();
        }

        /** Creates a single alarm, hands it to the context and tracks it for later deletion. */
        private void raiseLinkDownAlarm(String source, boolean applySkew) {
            // createLinkDownAlarm() already stubs alarmDao.get() for the new alarm
            final OnmsAlarm alarm = createLinkDownAlarm(source, applySkew);
            dac.handleNewOrUpdatedAlarm(alarm);
            alarms.add(alarm);
        }

        /**
         * Builds a WARNING alarm of type 1 whose last event was created "now"
         * according to the {@link PseudoClock}, and registers it with the mocked
         * {@link AlarmDao}.
         *
         * @param source event source to set on the alarm's last event
         * @param applySkew whether the event time is shifted back by the node's clock skew
         * @return the new alarm
         */
        private OnmsAlarm createLinkDownAlarm(String source, boolean applySkew) {
            final long now = PseudoClock.getInstance().getTime();

            final OnmsEvent event = new OnmsEvent();
            event.setEventSource(source);
            event.setEventCreateTime(new Date(now));
            event.setEventTime(new Date(applySkew ? now - clockSkewInMs : now));

            final int alarmId = alarmIdGenerator.incrementAndGet();
            final OnmsAlarm alarm = new OnmsAlarm();
            alarm.setId(alarmId);
            alarm.setAlarmType(1);
            alarm.setSeverity(OnmsSeverity.WARNING);
            alarm.setReductionKey("uei.opennms.org/linkDown/alarm:" + alarmId);
            alarm.setFirstEventTime(event.getEventTime());
            alarm.setLastEvent(event);

            final OnmsNode node = mock(OnmsNode.class);
            when(node.getId()).thenReturn(nodeId);
            alarm.setNode(node);

            when(alarmDao.get(alarm.getId())).thenReturn(alarm);
            return alarm;
        }
    }

    /**
     * Raises alarms twice, 31 minutes apart, on a node without skew and on a
     * node with 31 seconds of skew. Expects no event after the first round, a
     * single clockSkewDetected event after the second round, and no further
     * event 30 minutes later.
     */
    @Test
    public void canDetectClockSkew() {
        // Simulate two nodes, one with a clock skew and one without
        SimulatedNode n1 = new SimulatedNode(1, 0);
        SimulatedNode n2 = new SimulatedNode(2, TimeUnit.SECONDS.toMillis(31));

        // Advance the time
        advanceTime(60, TimeUnit.SECONDS);

        // Trigger link down alarms on each node
        n1.triggerLinkDownAlarms();
        n2.triggerLinkDownAlarms();
        dac.tick();

        // No events should have been generated
        verify(eventForwarder, times(0)).sendNow(any(Event.class));

        // Advance the time
        advanceTime(60, TimeUnit.SECONDS);

        // Delete the link down alarms
        n1.deleteLinkDownAlarms();
        n2.deleteLinkDownAlarms();
        dac.tick();

        // No events should have been generated
        verify(eventForwarder, times(0)).sendNow(any(Event.class));

        // Advance the time 30 minutes
        advanceTime(30, TimeUnit.MINUTES);

        // Trigger link down alarms on each node
        n1.triggerLinkDownAlarms();
        n2.triggerLinkDownAlarms();
        dac.tick();

        // Verify that exactly one event was sent and that it is the clock skew event
        ArgumentCaptor<Event> eventCaptor = ArgumentCaptor.forClass(Event.class);
        verify(eventForwarder).sendNow(eventCaptor.capture());
        Event event = eventCaptor.getValue();
        assertThat(event.getUei(), equalTo("uei.opennms.org/nodes/clockSkewDetected"));
        reset(eventForwarder);

        // Advance the time 30 minutes
        advanceTime(30, TimeUnit.MINUTES);
        dac.tick();

        // No new events should have been generated
        verify(eventForwarder, times(0)).sendNow(any(Event.class));
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.