process models in task
Some checks failed
Build and Push Docker Image / build (push) Has been cancelled

This commit is contained in:
Corban-Lee Jones 2024-10-31 13:10:27 +00:00
parent b8f1ffb8d9
commit eb97dca5c6
2 changed files with 220 additions and 23 deletions

View File

@ -7,28 +7,30 @@ import json
import asyncio import asyncio
import logging import logging
import datetime import datetime
import traceback
from os import getenv from os import getenv
from time import perf_counter from time import perf_counter
from collections import deque from collections import deque
import aiohttp # import aiohttp
import httpx import httpx
from aiocache import Cache import feedparser
import discord
# from aiocache import Cache
from discord import TextChannel from discord import TextChannel
from discord import app_commands, Interaction from discord import app_commands, Interaction
from discord.ext import commands, tasks from discord.ext import commands, tasks
from discord.errors import Forbidden from discord.errors import Forbidden
from feedparser import parse
import models import models
from feed import RSSFeed, Subscription, RSSItem, GuildSettings # from feed import RSSFeed, Subscription, RSSItem, GuildSettings
from utils import get_unparsed_feed # from utils import get_unparsed_feed
from filters import match_text # from filters import match_text
from api import API from api import API
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
cache = Cache(Cache.MEMORY) # cache = Cache(Cache.MEMORY)
BATCH_SIZE = 100 BATCH_SIZE = 100
@ -86,9 +88,9 @@ class TaskCog(commands.Cog):
try: try:
await self.do_task() await self.do_task()
except Exception as error: except Exception as exc:
log.error(error.with_traceback()) log.exception(exc)
await inter.followup.send(str(error)) await inter.followup.send(str(exc) or "unknown error")
finally: finally:
end_time = perf_counter() end_time = perf_counter()
await inter.followup.send(f"completed command in {end_time - start_time:.4f} seconds") await inter.followup.send(f"completed command in {end_time - start_time:.4f} seconds")
@ -152,6 +154,7 @@ class TaskCog(commands.Cog):
async def process_server(self, server: models.Server, client: httpx.AsyncClient): async def process_server(self, server: models.Server, client: httpx.AsyncClient):
log.debug(f"processing server: {server.name}") log.debug(f"processing server: {server.name}")
start_time = perf_counter()
subscriptions = await self.get_subscriptions(server, client) subscriptions = await self.get_subscriptions(server, client)
for subscription in subscriptions: for subscription in subscriptions:
@ -169,15 +172,35 @@ class TaskCog(commands.Cog):
] ]
await asyncio.gather(*tasks) await asyncio.gather(*tasks)
end_time = perf_counter()
log.debug(f"Finished processing server: {server.name} in {end_time - start_time:.4f} seconds")
async def process_subscription(self, subscription: models.Subscription, client: httpx.AsyncClient): async def process_subscription(self, subscription: models.Subscription, client: httpx.AsyncClient):
log.debug(f"processing subscription {subscription.name}") log.debug(f"processing subscription {subscription.name}")
start_time = perf_counter()
content = await client.get(subscription.url) raw_rss_content = await subscription.get_rss_content(client)
log.debug(content) if not raw_rss_content:
return
channels = await subscription.get_discord_channels(self.bot)
contents = models.Content.from_raw_rss(raw_rss_content, subscription)
valid_contents, invalid_contents = subscription.filter_entries(contents)
for content in valid_contents:
await self.process_content(content, channels)
tasks = [channel.send(content.item_title) for channel in channels]
asyncio.gather(*tasks)
end_time = perf_counter()
log.debug(f"Finished processing subscription: {subscription.name} in {end_time - start_time:.4f}")
async def process_valid_contents(contents: list[models.Content], channels: list[discord.TextChannel], client: httpx.AsyncClient):
semaphore = asyncio.Semaphore(5)
async def batch_process(content: models.Content, )
# @group.command(name="trigger") # @group.command(name="trigger")
# async def cmd_trigger_task(self, inter): # async def cmd_trigger_task(self, inter):

View File

@ -1,10 +1,16 @@
import re
import logging import logging
import hashlib
from enum import Enum from enum import Enum
from datetime import datetime from datetime import datetime
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from dataclasses import dataclass from dataclasses import dataclass
import httpx
import discord
import rapidfuzz
import feedparser
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -40,13 +46,13 @@ class Server(DjangoDataModel):
class MatchingAlgorithm(Enum): class MatchingAlgorithm(Enum):
MATCH_NONE = 0 NONE = 0
MATCH_ANY = 1 ANY = 1
MATCH_ALL = 2 ALL = 2
MATCH_LITERAL = 3 LITERAL = 3
MATCH_REGEX = 4 REGEX = 4
MATCH_FUZZY = 5 FUZZY = 5
MATCH_AUTO = 6 AUTO = 6
@classmethod @classmethod
def from_value(cls, value: int): def from_value(cls, value: int):
@ -54,7 +60,7 @@ class MatchingAlgorithm(Enum):
if member.value == value: if member.value == value:
return member return member
raise ValueError(f"No {self.__class__.__name__} for value: {value}") raise ValueError(f"No {cls.__class__.__name__} for value: {value}")
@ -76,6 +82,96 @@ class ContentFilter(DjangoDataModel):
item["matching_algorithm"] = MatchingAlgorithm.from_value(item.pop("matching_algorithm")) item["matching_algorithm"] = MatchingAlgorithm.from_value(item.pop("matching_algorithm"))
return item return item
@property
def _regex_flags(self):
return re.IGNORECASE if self.is_insensitive else 0
@property
def cleaned_matching_pattern(self):
"""
Splits the pattern to individual keywords, getting rid of unnecessary
spaces and grouping quoted words together.
"""
findterms = re.compile(r'"([^"]+)"|(\S+)').findall
normspace = re.compile(r"\s+").sub
return [
re.escape(normspace(" ", (t[0] or t[1]).strip())).replace(r"\ ", r"\s+")
for t in findterms(self.matching_pattern)
]
def _match_any(self, matching_against: str):
for word in self.cleaned_matching_pattern:
if re.search(rf"\b{word}\b", matching_against, self._regex_flags):
return True
return False
def _match_all(self, matching_against: str):
for word in self.cleaned_matching_pattern:
if re.search(rf"\b{word}\b", matching_against, self._regex_flags):
return False
return True
def _match_literal(self, matching_against: str):
return bool(
re.search(
rf"\b{re.escape(self.matching_pattern)}\b",
matching_against,
self._regex_flags
)
)
def _match_regex(self, matching_against: str):
try:
return bool(re.search(
re.compile(self.matching_pattern, self._regex_flags),
matching_against
))
except re.error as exc:
log.error(f"Filter regex error: {exc}")
return False
def _match_fuzzy(self, matching_against: str):
matching_against = re.sub(r"[^\w\s]", "", matching_against)
matching_pattern = re.sub(r"[^\w\s]", "", self.matching_pattern)
if self.is_insensitive:
matching_against = matching_against.lower()
matching_pattern = matching_pattern.lower()
return rapidfuzz.fuzz.partial_ratio(
matching_against,
matching_pattern,
score_cutoff=90
)
def _get_algorithm_func(self):
match self.matching_algorithm:
case MatchingAlgorithm.NONE: return
case MatchingAlgorithm.ANY: return self._match_any
case MatchingAlgorithm.ALL: return self._match_all
case MatchingAlgorithm.LITERAL: return self._match_literal
case MatchingAlgorithm.REGEX: return self._match_regex
case MatchingAlgorithm.FUZZY: return self._match_fuzzy
case _: return
def matches(self, content) -> bool:
log.debug(f"applying filter: {self}")
if not self.matching_pattern.strip():
return False
algorithm_func = self._get_algorithm_func()
if not algorithm_func:
log.error(f"Bad algorithm function: {self.matching_algorithm}")
return False
match_found = algorithm_func(content.item_title) or algorithm_func(content.item_description)
log.debug(f"filter match found: {match_found}")
return not match_found if self.is_whitelist else match_found
@dataclass(slots=True) @dataclass(slots=True)
class MessageMutator(DjangoDataModel): class MessageMutator(DjangoDataModel):
@ -175,6 +271,49 @@ class Subscription(DjangoDataModel):
def server(self, server: server): def server(self, server: server):
self._server = server self._server = server
async def get_rss_content(self, client: httpx.AsyncClient) -> str:
try:
response = await client.get(self.url)
response.raise_for_status()
except httpx.HTTPError as exc:
log.error("(%s) HTTP Exception for %s - %s", type(exc), exc.request.url, exc)
return
content_type = response.headers.get("Content-Type")
if not "text/xml" in content_type:
log.warning("Invalid 'Content-Type' header: %s (must contain 'text/xml')", content_type)
return
return response.text
async def get_discord_channels(self, bot) -> list:
channels = []
for channel_detail in self.channels:
try:
channel = bot.get_channel(channel_detail.id)
channels.append(channel or await bot.fetch_channel(channel_detail.id))
except discord.Forbidden:
log.error(f"Forbidden channel: ({channel.name}, {channel.id}) from ({self.server.name}, {self.server.id})")
return channels
def filter_entries(self, contents: list) -> tuple[list, list]:
log.debug(f"filtering entries for {self.name} in {self.server.name}")
valid_contents = []
invalid_contents = []
for content in contents:
log.debug(f"filtering: '{content.item_title}'")
if any(content_filter.matches(content) for content_filter in self.filters):
invalid_contents.append(content)
else:
valid_contents.append(content)
log.debug(f"filtered content: valid:{len(valid_contents)}, invalid:{len(invalid_contents)}")
return valid_contents, invalid_contents
@dataclass(slots=True) @dataclass(slots=True)
class Content(DjangoDataModel): class Content(DjangoDataModel):
@ -184,10 +323,45 @@ class Content(DjangoDataModel):
item_guid: str item_guid: str
item_url: str item_url: str
item_title: str item_title: str
item_content_hash: str item_description: str
_subscription: Subscription | None = None
@staticmethod @staticmethod
def parser(item: dict) -> dict: def parser(item: dict) -> dict:
item["id"] = item.pop("id") item["id"] = item.pop("id")
item["server_id"] = item.pop("server") item["subscription_id"] = item.pop("subscription")
return item return item
@classmethod
def from_raw_rss(cls, raw_rss_content: str, subscription: Subscription):
parsed_rss = feedparser.parse(raw_rss_content)
contents = []
for entry in parsed_rss.entries:
# content_hash = hashlib.new("sha256")
# content_hash.update(entry.get("description", "").encode())
# content_hash.hexdigest()
data = {
"id": -1,
"subscription": subscription.id,
"item_id": entry.get("id", ""),
"item_guid": entry.get("guid", ""),
"item_url": entry.get("link", ""),
"item_title": entry.get("title", ""),
"item_description": entry.get("description", "")
}
content = Content.from_dict(data)
content.subscription = subscription
contents.append(content)
return contents
@property
def subscription(self) -> Subscription:
return self._subscription
@subscription.setter
def subscription(self, subscription: Subscription):
self._subscription = subscription