Files
tacticalrmm/api/tacticalrmm/checks/models.py

902 lines
32 KiB
Python

import asyncio
import string
import os
import json
import pytz
from statistics import mean
from django.utils import timezone as djangotime
from django.db import models
from django.conf import settings
from django.core.validators import MinValueValidator, MaxValueValidator
from django.contrib.postgres.fields import ArrayField
from rest_framework.fields import JSONField
from typing import List, Any
from typing import Union
from loguru import logger
from core.models import CoreSettings
from logs.models import BaseAuditModel
from .tasks import (
handle_check_email_alert_task,
handle_check_sms_alert_task,
handle_resolved_check_email_alert_task,
handle_resolved_check_sms_alert_task,
)
from .utils import bytes2human
from alerts.models import SEVERITY_CHOICES
logger.configure(**settings.LOG_CONFIG)
CHECK_TYPE_CHOICES = [
("diskspace", "Disk Space Check"),
("ping", "Ping Check"),
("cpuload", "CPU Load Check"),
("memory", "Memory Check"),
("winsvc", "Service Check"),
("script", "Script Check"),
("eventlog", "Event Log Check"),
]
CHECK_STATUS_CHOICES = [
("passing", "Passing"),
("failing", "Failing"),
("pending", "Pending"),
]
EVT_LOG_NAME_CHOICES = [
("Application", "Application"),
("System", "System"),
("Security", "Security"),
]
EVT_LOG_TYPE_CHOICES = [
("INFO", "Information"),
("WARNING", "Warning"),
("ERROR", "Error"),
("AUDIT_SUCCESS", "Success Audit"),
("AUDIT_FAILURE", "Failure Audit"),
]
EVT_LOG_FAIL_WHEN_CHOICES = [
("contains", "Log contains"),
("not_contains", "Log does not contain"),
]
class Check(BaseAuditModel):
# common fields
agent = models.ForeignKey(
"agents.Agent",
related_name="agentchecks",
null=True,
blank=True,
on_delete=models.CASCADE,
)
policy = models.ForeignKey(
"automation.Policy",
related_name="policychecks",
null=True,
blank=True,
on_delete=models.CASCADE,
)
managed_by_policy = models.BooleanField(default=False)
overriden_by_policy = models.BooleanField(default=False)
parent_check = models.PositiveIntegerField(null=True, blank=True)
name = models.CharField(max_length=255, null=True, blank=True)
check_type = models.CharField(
max_length=50, choices=CHECK_TYPE_CHOICES, default="diskspace"
)
status = models.CharField(
max_length=100, choices=CHECK_STATUS_CHOICES, default="pending"
)
more_info = models.TextField(null=True, blank=True)
last_run = models.DateTimeField(null=True, blank=True)
email_alert = models.BooleanField(default=False)
text_alert = models.BooleanField(default=False)
dashboard_alert = models.BooleanField(default=False)
fails_b4_alert = models.PositiveIntegerField(default=1)
fail_count = models.PositiveIntegerField(default=0)
outage_history = models.JSONField(null=True, blank=True) # store
extra_details = models.JSONField(null=True, blank=True)
# check specific fields
# for eventlog, script, ip, and service alert severity
alert_severity = models.CharField(
max_length=15,
choices=SEVERITY_CHOICES,
default="warning",
null=True,
blank=True,
)
# threshold percent for diskspace, cpuload or memory check
error_threshold = models.PositiveIntegerField(
validators=[MinValueValidator(0), MaxValueValidator(99)],
null=True,
blank=True,
default=0,
)
warning_threshold = models.PositiveIntegerField(
null=True,
blank=True,
validators=[MinValueValidator(0), MaxValueValidator(99)],
default=0,
)
# diskcheck i.e C:, D: etc
disk = models.CharField(max_length=2, null=True, blank=True)
# ping checks
ip = models.CharField(max_length=255, null=True, blank=True)
# script checks
script = models.ForeignKey(
"scripts.Script",
related_name="script",
on_delete=models.CASCADE,
null=True,
blank=True,
)
script_args = ArrayField(
models.CharField(max_length=255, null=True, blank=True),
null=True,
blank=True,
default=list,
)
info_return_codes = ArrayField(
models.PositiveIntegerField(),
null=True,
blank=True,
default=list,
)
warning_return_codes = ArrayField(
models.PositiveIntegerField(),
null=True,
blank=True,
default=list,
)
timeout = models.PositiveIntegerField(null=True, blank=True)
stdout = models.TextField(null=True, blank=True)
stderr = models.TextField(null=True, blank=True)
retcode = models.IntegerField(null=True, blank=True)
execution_time = models.CharField(max_length=100, null=True, blank=True)
# cpu and mem check history
history = ArrayField(
models.IntegerField(blank=True), null=True, blank=True, default=list
)
# win service checks
svc_name = models.CharField(max_length=255, null=True, blank=True)
svc_display_name = models.CharField(max_length=255, null=True, blank=True)
pass_if_start_pending = models.BooleanField(null=True, blank=True)
pass_if_svc_not_exist = models.BooleanField(default=False)
restart_if_stopped = models.BooleanField(null=True, blank=True)
svc_policy_mode = models.CharField(
max_length=20, null=True, blank=True
) # 'default' or 'manual', for editing policy check
# event log checks
log_name = models.CharField(
max_length=255, choices=EVT_LOG_NAME_CHOICES, null=True, blank=True
)
event_id = models.IntegerField(null=True, blank=True)
event_id_is_wildcard = models.BooleanField(default=False)
event_type = models.CharField(
max_length=255, choices=EVT_LOG_TYPE_CHOICES, null=True, blank=True
)
event_source = models.CharField(max_length=255, null=True, blank=True)
event_message = models.TextField(null=True, blank=True)
fail_when = models.CharField(
max_length=255, choices=EVT_LOG_FAIL_WHEN_CHOICES, null=True, blank=True
)
search_last_days = models.PositiveIntegerField(null=True, blank=True)
def __str__(self):
if self.agent:
return f"{self.agent.hostname} - {self.readable_desc}"
else:
return f"{self.policy.name} - {self.readable_desc}"
@property
def readable_desc(self):
if self.check_type == "diskspace":
text = ""
if self.warning_threshold:
text += f" Warning Threshold: {self.warning_threshold}%"
if self.error_threshold:
text += f" Error Threshold: {self.error_threshold}%"
return f"{self.get_check_type_display()}: Drive {self.disk} < {text}"
elif self.check_type == "ping":
return f"{self.get_check_type_display()}: {self.name}"
elif self.check_type == "cpuload" or self.check_type == "memory":
text = ""
if self.warning_threshold:
text += f" Warning Threshold: {self.warning_threshold}%"
if self.error_threshold:
text += f" Error Threshold: {self.error_threshold}%"
return f"{self.get_check_type_display()} > {text}"
elif self.check_type == "winsvc":
return f"{self.get_check_type_display()}: {self.svc_display_name}"
elif self.check_type == "eventlog":
return f"{self.get_check_type_display()}: {self.name}"
elif self.check_type == "script":
return f"{self.get_check_type_display()}: {self.script.name}"
else:
return "n/a"
@property
def history_info(self):
if self.check_type == "cpuload" or self.check_type == "memory":
return ", ".join(str(f"{x}%") for x in self.history[-6:])
@property
def last_run_as_timezone(self):
if self.last_run is not None and self.agent is not None:
return self.last_run.astimezone(
pytz.timezone(self.agent.timezone)
).strftime("%b-%d-%Y - %H:%M")
return self.last_run
@property
def non_editable_fields(self) -> List[str]:
return [
"check_type",
"status",
"more_info",
"last_run",
"fail_count",
"outage_history",
"extra_details",
"stdout",
"stderr",
"retcode",
"execution_time",
"history",
"readable_desc",
"history_info",
"parent_check",
"managed_by_policy",
"overriden_by_policy",
"created_by",
"created_time",
"modified_by",
"modified_time",
]
def handle_alert(self) -> None:
from alerts.models import Alert, AlertTemplate
# return if agent is in maintenance mode
if self.agent.maintenance_mode:
return
# see if agent has an alert template and use that
alert_template: Union[AlertTemplate, None] = self.agent.get_alert_template()
# resolve alert if it exists
if self.status == "passing":
if Alert.objects.filter(assigned_check=self, resolved=False).exists():
alert = Alert.objects.get(assigned_check=self, resolved=False)
alert.resolve()
# check if a resolved email notification should be send
if (
alert_template
and alert_template.check_email_on_resolved
and not alert.resolved_email_sent
):
handle_resolved_check_email_alert_task.delay(pk=alert.pk)
# check if resolved text should be sent
if (
alert_template
and alert_template.check_text_on_resolved
and not alert.resolved_sms_sent
):
handle_resolved_check_sms_alert_task.delay(pk=alert.pk)
# check if resolved script should be run
if (
alert_template
and alert_template.resolved_action
and not alert.resolved_action_run
):
r = self.agent.run_script(
scriptpk=alert_template.resolved_action.pk,
args=alert_template.resolved_action_args,
timeout=alert_template.resolved_action_timeout,
wait=True,
full=True,
run_on_any=True,
)
# command was successful
if type(r) == dict:
alert.resolved_action_retcode = r["retcode"]
alert.resolved_action_stdout = r["stdout"]
alert.resolved_action_stderr = r["stderr"]
alert.resolved_action_execution_time = "{:.4f}".format(
r["execution_time"]
)
alert.resolved_action_run = djangotime.now()
alert.save()
else:
logger.error(
f"Resolved action: {alert_template.action.name} failed to run on any agent for {self.agent.hostname} resolved alert for {self.check_type} check"
)
elif self.fail_count >= self.fails_b4_alert:
if not Alert.objects.filter(assigned_check=self, resolved=False).exists():
alert = Alert.create_check_alert(self)
else:
alert = Alert.objects.get(assigned_check=self, resolved=False)
# check if alert severity changed on check and update the alert
if self.alert_severity != alert.severity:
alert.severity = self.alert_severity
alert.save(update_fields=["severity"])
# create alert in dashboard if enabled
if (
self.dashboard_alert
or alert_template
and self.alert_severity in alert_template.check_dashboard_alert_severity
and alert_template.check_always_alert
):
alert.hidden = False
alert.save()
# send email if enabled
if (
not alert.email_sent
and self.email_alert
or alert_template
and self.alert_severity in alert_template.check_email_alert_severity
and alert_template.check_always_email
):
handle_check_email_alert_task.delay(
pk=alert.pk,
alert_interval=alert_template.check_periodic_alert_days
if alert_template
else None,
)
# send text if enabled
if (
not alert.sms_sent
and self.text_alert
or alert_template
and self.alert_severity in alert_template.check_text_alert_severity
and alert_template.check_always_text
):
handle_check_sms_alert_task.delay(
pk=alert.pk,
alert_interval=alert_template.check_periodic_alert_days
if alert_template
else None,
)
# check if any scripts should be run
if alert_template and alert_template.action and not alert.action_run:
r = self.agent.run_script(
scriptpk=alert_template.action.pk,
args=alert_template.action_args,
timeout=alert_template.action_timeout,
wait=True,
full=True,
run_on_any=True,
)
# command was successful
if type(r) == dict:
alert.action_retcode = r["retcode"]
alert.action_stdout = r["stdout"]
alert.action_stderr = r["stderr"]
alert.action_execution_time = "{:.4f}".format(r["execution_time"])
alert.action_run = djangotime.now()
alert.save()
else:
logger.error(
f"Failure action: {alert_template.action.name} failed to run on any agent for {self.agent.hostname} failure alert for {self.check_type} check{r}"
)
def add_check_history(self, value: int, more_info: Any = None) -> None:
CheckHistory.objects.create(check_history=self, y=value, results=more_info)
def handle_checkv2(self, data):
# cpuload or mem checks
if self.check_type == "cpuload" or self.check_type == "memory":
self.history.append(data["percent"])
if len(self.history) > 15:
self.history = self.history[-15:]
self.save(update_fields=["history"])
avg = int(mean(self.history))
if self.error_threshold and avg > self.error_threshold:
self.status = "failing"
self.alert_severity = "error"
elif self.warning_threshold and avg > self.warning_threshold:
self.status = "failing"
self.alert_severity = "warning"
else:
self.status = "passing"
# add check history
self.add_check_history(data["percent"])
# diskspace checks
elif self.check_type == "diskspace":
if data["exists"]:
percent_used = round(data["percent_used"])
total = bytes2human(data["total"])
free = bytes2human(data["free"])
if self.error_threshold and (100 - percent_used) < self.error_threshold:
self.status = "failing"
self.alert_severity = "error"
elif (
self.warning_threshold
and (100 - percent_used) < self.warning_threshold
):
self.status = "failing"
self.alert_severity = "warning"
else:
self.status = "passing"
self.more_info = f"Total: {total}B, Free: {free}B"
# add check history
self.add_check_history(100 - percent_used)
else:
self.status = "failing"
self.alert_severity = "error"
self.more_info = f"Disk {self.disk} does not exist"
self.save(update_fields=["more_info"])
# script checks
elif self.check_type == "script":
self.stdout = data["stdout"]
self.stderr = data["stderr"]
self.retcode = data["retcode"]
try:
# python agent
self.execution_time = "{:.4f}".format(data["stop"] - data["start"])
except:
# golang agent
self.execution_time = "{:.4f}".format(data["runtime"])
if data["retcode"] in self.info_return_codes:
self.alert_severity = "info"
self.status = "failing"
elif data["retcode"] in self.warning_return_codes:
self.alert_severity = "warning"
self.status = "failing"
elif data["retcode"] != 0:
self.status = "failing"
self.alert_severity = "error"
else:
self.status = "passing"
self.save(
update_fields=[
"stdout",
"stderr",
"retcode",
"execution_time",
]
)
# add check history
self.add_check_history(
1 if self.status == "failing" else 0,
{
"retcode": data["retcode"],
"stdout": data["stdout"][:60],
"stderr": data["stderr"][:60],
"execution_time": self.execution_time,
},
)
# ping checks
elif self.check_type == "ping":
success = ["Reply", "bytes", "time", "TTL"]
output = data["output"]
if data["has_stdout"]:
if all(x in output for x in success):
self.status = "passing"
else:
self.status = "failing"
elif data["has_stderr"]:
self.status = "failing"
self.more_info = output
self.save(update_fields=["more_info"])
self.add_check_history(
1 if self.status == "failing" else 0, self.more_info[:60]
)
# windows service checks
elif self.check_type == "winsvc":
svc_stat = data["status"]
self.more_info = f"Status {svc_stat.upper()}"
if data["exists"]:
if svc_stat == "running":
self.status = "passing"
elif svc_stat == "start_pending" and self.pass_if_start_pending:
self.status = "passing"
else:
if self.agent and self.restart_if_stopped:
nats_data = {
"func": "winsvcaction",
"payload": {"name": self.svc_name, "action": "start"},
}
r = asyncio.run(self.agent.nats_cmd(nats_data, timeout=32))
if r == "timeout" or r == "natsdown":
self.status = "failing"
elif not r["success"] and r["errormsg"]:
self.status = "failing"
elif r["success"]:
self.status = "passing"
self.more_info = f"Status RUNNING"
else:
self.status = "failing"
else:
self.status = "failing"
else:
if self.pass_if_svc_not_exist:
self.status = "passing"
else:
self.status = "failing"
self.more_info = f"Service {self.svc_name} does not exist"
self.save(update_fields=["more_info"])
self.add_check_history(
1 if self.status == "failing" else 0, self.more_info[:60]
)
elif self.check_type == "eventlog":
log = []
is_wildcard = self.event_id_is_wildcard
eventType = self.event_type
eventID = self.event_id
source = self.event_source
message = self.event_message
r = data["log"]
for i in r:
if i["eventType"] == eventType:
if not is_wildcard and not int(i["eventID"]) == eventID:
continue
if not source and not message:
if is_wildcard:
log.append(i)
elif int(i["eventID"]) == eventID:
log.append(i)
continue
if source and message:
if is_wildcard:
if source in i["source"] and message in i["message"]:
log.append(i)
elif int(i["eventID"]) == eventID:
if source in i["source"] and message in i["message"]:
log.append(i)
continue
if source and source in i["source"]:
if is_wildcard:
log.append(i)
elif int(i["eventID"]) == eventID:
log.append(i)
if message and message in i["message"]:
if is_wildcard:
log.append(i)
elif int(i["eventID"]) == eventID:
log.append(i)
if self.fail_when == "contains":
if log:
self.status = "failing"
else:
self.status = "passing"
elif self.fail_when == "not_contains":
if log:
self.status = "passing"
else:
self.status = "failing"
self.extra_details = {"log": log}
self.save(update_fields=["extra_details"])
self.add_check_history(
1 if self.status == "failing" else 0,
"Events Found:" + str(len(self.extra_details["log"])),
)
# handle status
if self.status == "failing":
self.fail_count += 1
self.save(update_fields=["status", "fail_count", "alert_severity"])
elif self.status == "passing":
self.fail_count = 0
self.save(update_fields=["status", "fail_count", "alert_severity"])
self.handle_alert()
return self.status
@staticmethod
def serialize(check):
# serializes the check and returns json
from .serializers import CheckSerializer
return CheckSerializer(check).data
# for policy diskchecks
@staticmethod
def all_disks():
return [f"{i}:" for i in string.ascii_uppercase]
# for policy service checks
@staticmethod
def load_default_services():
with open(
os.path.join(settings.BASE_DIR, "services/default_services.json")
) as f:
default_services = json.load(f)
return default_services
def create_policy_check(self, agent=None, policy=None):
if not agent and not policy or agent and policy:
return
Check.objects.create(
agent=agent,
policy=policy,
managed_by_policy=bool(agent),
parent_check=(self.pk if agent else None),
name=self.name,
alert_severity=self.alert_severity,
check_type=self.check_type,
email_alert=self.email_alert,
dashboard_alert=self.dashboard_alert,
text_alert=self.text_alert,
fails_b4_alert=self.fails_b4_alert,
extra_details=self.extra_details,
error_threshold=self.error_threshold,
warning_threshold=self.warning_threshold,
disk=self.disk,
ip=self.ip,
script=self.script,
script_args=self.script_args,
timeout=self.timeout,
info_return_codes=self.info_return_codes,
warning_return_codes=self.warning_return_codes,
svc_name=self.svc_name,
svc_display_name=self.svc_display_name,
pass_if_start_pending=self.pass_if_start_pending,
pass_if_svc_not_exist=self.pass_if_svc_not_exist,
restart_if_stopped=self.restart_if_stopped,
svc_policy_mode=self.svc_policy_mode,
log_name=self.log_name,
event_id=self.event_id,
event_id_is_wildcard=self.event_id_is_wildcard,
event_type=self.event_type,
event_source=self.event_source,
event_message=self.event_message,
fail_when=self.fail_when,
search_last_days=self.search_last_days,
)
def is_duplicate(self, check):
if self.check_type == "diskspace":
return self.disk == check.disk
elif self.check_type == "script":
return self.script == check.script
elif self.check_type == "ping":
return self.ip == check.ip
elif self.check_type == "cpuload":
return True
elif self.check_type == "memory":
return True
elif self.check_type == "winsvc":
return self.svc_name == check.svc_name
elif self.check_type == "eventlog":
return [self.log_name, self.event_id] == [check.log_name, check.event_id]
def send_email(self):
CORE = CoreSettings.objects.first()
alert_template = self.agent.get_alert_template()
body: str = ""
if self.agent:
subject = f"{self.agent.client.name}, {self.agent.site.name}, {self} Failed"
else:
subject = f"{self} Failed"
if self.check_type == "diskspace":
text = ""
if self.warning_threshold:
text += f" Warning Threshold: {self.warning_threshold}%"
if self.error_threshold:
text += f" Error Threshold: {self.error_threshold}%"
percent_used = [
d["percent"] for d in self.agent.disks if d["device"] == self.disk
][0]
percent_free = 100 - percent_used
body = subject + f" - Free: {percent_free}%, {text}"
elif self.check_type == "script":
body = (
subject
+ f" - Return code: {self.retcode}\nStdout:{self.stdout}\nStderr: {self.stderr}"
)
elif self.check_type == "ping":
body = self.more_info
elif self.check_type == "cpuload" or self.check_type == "memory":
text = ""
if self.warning_threshold:
text += f" Warning Threshold: {self.warning_threshold}%"
if self.error_threshold:
text += f" Error Threshold: {self.error_threshold}%"
avg = int(mean(self.history))
if self.check_type == "cpuload":
body = subject + f" - Average CPU utilization: {avg}%, {text}"
elif self.check_type == "memory":
body = subject + f" - Average memory usage: {avg}%, {text}"
elif self.check_type == "winsvc":
try:
status = list(
filter(lambda x: x["name"] == self.svc_name, self.agent.services)
)[0]["status"]
# catch services that don't exist if policy check
except:
status = "Unknown"
body = subject + f" - Status: {status.upper()}"
elif self.check_type == "eventlog":
if self.event_source and self.event_message:
start = f"Event ID {self.event_id}, source {self.event_source}, containing string {self.event_message} "
elif self.event_source:
start = f"Event ID {self.event_id}, source {self.event_source} "
elif self.event_message:
start = (
f"Event ID {self.event_id}, containing string {self.event_message} "
)
else:
start = f"Event ID {self.event_id} "
body = start + f"was found in the {self.log_name} log\n\n"
for i in self.extra_details["log"]:
try:
if i["message"]:
body += f"{i['message']}\n"
except:
continue
CORE.send_mail(subject, body, alert_template=alert_template)
def send_sms(self):
CORE = CoreSettings.objects.first()
alert_template = self.agent.get_alert_template()
body: str = ""
if self.agent:
subject = f"{self.agent.client.name}, {self.agent.site.name}, {self} Failed"
else:
subject = f"{self} Failed"
if self.check_type == "diskspace":
text = ""
if self.warning_threshold:
text += f" Warning Threshold: {self.warning_threshold}%"
if self.error_threshold:
text += f" Error Threshold: {self.error_threshold}%"
percent_used = [
d["percent"] for d in self.agent.disks if d["device"] == self.disk
][0]
percent_free = 100 - percent_used
body = subject + f" - Free: {percent_free}%, {text}"
elif self.check_type == "script":
body = subject + f" - Return code: {self.retcode}"
elif self.check_type == "ping":
body = subject
elif self.check_type == "cpuload" or self.check_type == "memory":
text = ""
if self.warning_threshold:
text += f" Warning Threshold: {self.warning_threshold}%"
if self.error_threshold:
text += f" Error Threshold: {self.error_threshold}%"
avg = int(mean(self.history))
if self.check_type == "cpuload":
body = subject + f" - Average CPU utilization: {avg}%, {text}"
elif self.check_type == "memory":
body = subject + f" - Average memory usage: {avg}%, {text}"
elif self.check_type == "winsvc":
status = list(
filter(lambda x: x["name"] == self.svc_name, self.agent.services)
)[0]["status"]
body = subject + f" - Status: {status.upper()}"
elif self.check_type == "eventlog":
body = subject
CORE.send_sms(body, alert_template=alert_template)
def send_resolved_email(self):
CORE = CoreSettings.objects.first()
alert_template = self.agent.get_alert_template()
subject = f"{self.agent.client.name}, {self.agent.site.name}, {self} Resolved"
body = f"{self} is now back to normal"
CORE.send_mail(subject, body, alert_template=alert_template)
def send_resolved_sms(self):
CORE = CoreSettings.objects.first()
alert_template = self.agent.get_alert_template()
subject = f"{self.agent.client.name}, {self.agent.site.name}, {self} Resolved"
CORE.send_sms(subject, alert_template=alert_template)
class CheckHistory(models.Model):
check_history = models.ForeignKey(
Check,
related_name="check_history",
on_delete=models.CASCADE,
)
x = models.DateTimeField(auto_now_add=True)
y = models.PositiveIntegerField(null=True, blank=True, default=None)
results = models.JSONField(null=True, blank=True)
def __str__(self):
return self.check_history.readable_desc