move agents monitor task to go
This commit is contained in:
@@ -16,27 +16,6 @@ from logs.models import PendingAction
|
||||
logger.configure(**settings.LOG_CONFIG)
|
||||
|
||||
|
||||
def _check_agent_service(pk: int) -> None:
|
||||
agent = Agent.objects.get(pk=pk)
|
||||
r = asyncio.run(agent.nats_cmd({"func": "ping"}, timeout=2))
|
||||
# if the agent is respoding to pong from the rpc service but is not showing as online (handled by tacticalagent service)
|
||||
# then tacticalagent service is hung. forcefully restart it
|
||||
if r == "pong":
|
||||
logger.info(
|
||||
f"Detected crashed tacticalagent service on {agent.hostname} v{agent.version}, attempting recovery"
|
||||
)
|
||||
data = {"func": "recover", "payload": {"mode": "tacagent"}}
|
||||
asyncio.run(agent.nats_cmd(data, wait=False))
|
||||
|
||||
|
||||
@app.task
|
||||
def monitor_agents_task() -> None:
|
||||
q = Agent.objects.only("pk", "version", "last_seen", "overdue_time")
|
||||
agents: List[int] = [i.pk for i in q if i.has_nats and i.status != "online"]
|
||||
for agent in agents:
|
||||
_check_agent_service(agent)
|
||||
|
||||
|
||||
def agent_update(pk: int) -> str:
|
||||
agent = Agent.objects.get(pk=pk)
|
||||
# skip if we can't determine the arch
|
||||
|
||||
@@ -8,4 +8,6 @@ urlpatterns = [
|
||||
path("winupdates/", views.NatsWinUpdates.as_view()),
|
||||
path("choco/", views.NatsChoco.as_view()),
|
||||
path("wmi/", views.NatsWMI.as_view()),
|
||||
path("offline/", views.OfflineAgents.as_view()),
|
||||
path("logcrash/", views.LogCrash.as_view()),
|
||||
]
|
||||
|
||||
@@ -256,3 +256,31 @@ class NatsWMI(APIView):
|
||||
if pyver.parse(i.version) >= pyver.parse("1.2.0") and i.status == "online"
|
||||
]
|
||||
return Response({"agent_ids": online})
|
||||
|
||||
|
||||
class OfflineAgents(APIView):
|
||||
authentication_classes = []
|
||||
permission_classes = []
|
||||
|
||||
def get(self, request):
|
||||
agents = Agent.objects.only(
|
||||
"pk", "agent_id", "version", "last_seen", "overdue_time"
|
||||
)
|
||||
offline: List[str] = [
|
||||
i.agent_id for i in agents if i.has_nats and i.status != "online"
|
||||
]
|
||||
return Response({"agent_ids": offline})
|
||||
|
||||
|
||||
class LogCrash(APIView):
|
||||
authentication_classes = []
|
||||
permission_classes = []
|
||||
|
||||
def post(self, request):
|
||||
agent = get_object_or_404(Agent, agent_id=request.data["agentid"])
|
||||
logger.info(
|
||||
f"Detected crashed tacticalagent service on {agent.hostname} v{agent.version}, attempting recovery"
|
||||
)
|
||||
agent.last_seen = djangotime.now()
|
||||
agent.save(update_fields=["last_seen"])
|
||||
return Response("ok")
|
||||
|
||||
@@ -33,10 +33,6 @@ app.conf.beat_schedule = {
|
||||
"task": "agents.tasks.auto_self_agent_update_task",
|
||||
"schedule": crontab(minute=35, hour="*"),
|
||||
},
|
||||
"check-agentservice": {
|
||||
"task": "agents.tasks.monitor_agents_task",
|
||||
"schedule": crontab(minute="*/15"),
|
||||
},
|
||||
"remove-salt": {
|
||||
"task": "agents.tasks.remove_salt_task",
|
||||
"schedule": crontab(minute=14, hour="*/2"),
|
||||
|
||||
@@ -76,6 +76,7 @@ func Listen(apihost, natshost, version string, debug bool) {
|
||||
}
|
||||
|
||||
go getWMI(rClient, nc)
|
||||
go monitorAgents(rClient, nc)
|
||||
|
||||
nc.Subscribe("*", func(msg *nats.Msg) {
|
||||
var mh codec.MsgpackHandle
|
||||
|
||||
Binary file not shown.
@@ -8,6 +8,43 @@ import (
|
||||
"github.com/ugorji/go/codec"
|
||||
)
|
||||
|
||||
func monitorAgents(c *resty.Client, nc *nats.Conn) {
|
||||
var payload, recPayload []byte
|
||||
var mh codec.MsgpackHandle
|
||||
mh.RawToString = true
|
||||
ret := codec.NewEncoderBytes(&payload, new(codec.MsgpackHandle))
|
||||
ret.Encode(map[string]string{"func": "ping"})
|
||||
|
||||
rec := codec.NewEncoderBytes(&recPayload, new(codec.MsgpackHandle))
|
||||
rec.Encode(Recovery{
|
||||
Func: "recover",
|
||||
Data: map[string]string{"mode": "tacagent"},
|
||||
})
|
||||
|
||||
tick := time.NewTicker(10 * time.Minute)
|
||||
for range tick.C {
|
||||
agentids, _ := c.R().SetResult(&AgentIDS{}).Get("/offline/")
|
||||
ids := agentids.Result().(*AgentIDS).IDs
|
||||
var resp string
|
||||
for _, id := range ids {
|
||||
out, err := nc.Request(id, payload, 2*time.Second)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
dec := codec.NewDecoderBytes(out.Data, &mh)
|
||||
if err := dec.Decode(&resp); err == nil {
|
||||
// if the agent is respoding to pong from the rpc service but is not showing as online (handled by tacticalagent service)
|
||||
// then tacticalagent service is hung. forcefully restart it
|
||||
if resp == "pong" {
|
||||
nc.Publish(id, recPayload)
|
||||
p := map[string]string{"agentid": id}
|
||||
c.R().SetBody(p).Post("/logcrash/")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func getWMI(c *resty.Client, nc *nats.Conn) {
|
||||
var payload []byte
|
||||
var mh codec.MsgpackHandle
|
||||
|
||||
@@ -8,3 +8,8 @@ type NatsInfo struct {
|
||||
type AgentIDS struct {
|
||||
IDs []string `json:"agent_ids"`
|
||||
}
|
||||
|
||||
type Recovery struct {
|
||||
Func string `json:"func"`
|
||||
Data map[string]string `json:"payload"`
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user