import re import logging import hashlib from enum import Enum from datetime import datetime from abc import ABC, abstractmethod from dataclasses import dataclass import httpx import discord import rapidfuzz import feedparser log = logging.getLogger(__name__) @dataclass class DjangoDataModel(ABC): @staticmethod @abstractmethod def parser(item: dict) -> dict: return item @classmethod def from_list(cls, data: list[dict]) -> list: return [cls(**cls.parser(item)) for item in data] @classmethod def from_dict(cls, data: dict): return cls(**cls.parser(data)) @dataclass(slots=True) class Server(DjangoDataModel): id: int name: str icon_hash: str is_bot_operational: bool active: bool @staticmethod def parser(item: dict) -> dict: item["id"] = int(item.pop("id")) return item class MatchingAlgorithm(Enum): NONE = 0 ANY = 1 ALL = 2 LITERAL = 3 REGEX = 4 FUZZY = 5 AUTO = 6 @classmethod def from_value(cls, value: int): for member in cls: if member.value == value: return member raise ValueError(f"No {cls.__class__.__name__} for value: {value}") @dataclass(slots=True) class ContentFilter(DjangoDataModel): id: int server_id: int name: str matching_pattern: str matching_algorithm: MatchingAlgorithm is_insensitive: bool is_whitelist: bool @staticmethod def parser(item: dict) -> dict: item["id"] = item.pop("id") item["server_id"] = item.pop("server") item["matching_pattern"] = item.pop("match") item["matching_algorithm"] = MatchingAlgorithm.from_value(item.pop("matching_algorithm")) return item @property def _regex_flags(self): return re.IGNORECASE if self.is_insensitive else 0 @property def cleaned_matching_pattern(self): """ Splits the pattern to individual keywords, getting rid of unnecessary spaces and grouping quoted words together. """ findterms = re.compile(r'"([^"]+)"|(\S+)').findall normspace = re.compile(r"\s+").sub return [ re.escape(normspace(" ", (t[0] or t[1]).strip())).replace(r"\ ", r"\s+") for t in findterms(self.matching_pattern) ] def _match_any(self, matching_against: str): for word in self.cleaned_matching_pattern: if re.search(rf"\b{word}\b", matching_against, self._regex_flags): return True return False def _match_all(self, matching_against: str): for word in self.cleaned_matching_pattern: if re.search(rf"\b{word}\b", matching_against, self._regex_flags): return False return True def _match_literal(self, matching_against: str): return bool( re.search( rf"\b{re.escape(self.matching_pattern)}\b", matching_against, self._regex_flags ) ) def _match_regex(self, matching_against: str): try: return bool(re.search( re.compile(self.matching_pattern, self._regex_flags), matching_against )) except re.error as exc: log.error(f"Filter regex error: {exc}") return False def _match_fuzzy(self, matching_against: str): matching_against = re.sub(r"[^\w\s]", "", matching_against) matching_pattern = re.sub(r"[^\w\s]", "", self.matching_pattern) if self.is_insensitive: matching_against = matching_against.lower() matching_pattern = matching_pattern.lower() return rapidfuzz.fuzz.partial_ratio( matching_against, matching_pattern, score_cutoff=90 ) def _get_algorithm_func(self): match self.matching_algorithm: case MatchingAlgorithm.NONE: return case MatchingAlgorithm.ANY: return self._match_any case MatchingAlgorithm.ALL: return self._match_all case MatchingAlgorithm.LITERAL: return self._match_literal case MatchingAlgorithm.REGEX: return self._match_regex case MatchingAlgorithm.FUZZY: return self._match_fuzzy case _: return def matches(self, content) -> bool: log.debug(f"applying filter: {self}") if not self.matching_pattern.strip(): return False algorithm_func = self._get_algorithm_func() if not algorithm_func: log.error(f"Bad algorithm function: {self.matching_algorithm}") return False match_found = algorithm_func(content.item_title) or algorithm_func(content.item_description) log.debug(f"filter match found: {match_found}") return not match_found if self.is_whitelist else match_found @dataclass(slots=True) class MessageMutator(DjangoDataModel): id: int name: str value: str @staticmethod def parser(item: dict) -> dict: item["id"] = item.pop("id") return item @dataclass(slots=True) class MessageStyle(DjangoDataModel): id: int server_id: int name: str colour: str is_embed: bool is_hyperlinked: bool show_author: bool show_timestamp: bool show_images: bool fetch_images: bool title_mutator: dict | None description_mutator: dict | None auto_created: bool @staticmethod def parser(item: dict) -> dict: item["id"] = int(item.pop("id")) item["server_id"] = int(item.pop("server")) item["title_mutator"] = item.pop("title_mutator_detail") item["description_mutator"] = item.pop("description_mutator_detail") return item @dataclass(slots=True) class UniqueContentRule(DjangoDataModel): id: int name: str value: str @staticmethod def parser(item: dict) -> dict: item["id"] = int(item.pop("id")) return item @dataclass(slots=True) class DiscordChannel(DjangoDataModel): id: int name: str is_nsfw: bool @staticmethod def parser(item: dict) -> dict: item["id"] = int(item.pop("id")) return item @dataclass(slots=True) class Subscription(DjangoDataModel): id: int server_id: int name: str url: str created_at: datetime updated_at: datetime extra_notes: str active: bool publish_threshold: datetime channels: list[DiscordChannel] filters: list[ContentFilter] message_style: MessageStyle unique_rules: UniqueContentRule _server: Server | None = None @staticmethod def parser(item: dict) -> dict: item["id"] = int(item.pop("id")) item["server_id"] = int(item.pop("server")) item["created_at"] = datetime.strptime(item.pop("created_at"), "%Y-%m-%dT%H:%M:%S.%f%z") item["updated_at"] = datetime.strptime(item.pop("updated_at"), "%Y-%m-%dT%H:%M:%S.%f%z") item["publish_threshold"] = datetime.strptime(item.pop("publish_threshold"), "%Y-%m-%dT%H:%M:%S%z") item["channels"] = DiscordChannel.from_list(item.pop("channels_detail")) item["filters"] = ContentFilter.from_list(item.pop("filters_detail")) item["message_style"] = MessageStyle.from_dict(item.pop("message_style_detail")) item["unique_rules"] = UniqueContentRule.from_list(item.pop("unique_rules_detail")) return item @property def server(self) -> Server: return self._server @server.setter def server(self, server: server): self._server = server async def get_rss_content(self, client: httpx.AsyncClient) -> str: try: response = await client.get(self.url) response.raise_for_status() except httpx.HTTPError as exc: log.error("(%s) HTTP Exception for %s - %s", type(exc), exc.request.url, exc) return content_type = response.headers.get("Content-Type") if not "text/xml" in content_type: log.warning("Invalid 'Content-Type' header: %s (must contain 'text/xml')", content_type) return return response.text async def get_discord_channels(self, bot) -> list: channels = [] for channel_detail in self.channels: try: channel = bot.get_channel(channel_detail.id) channels.append(channel or await bot.fetch_channel(channel_detail.id)) except discord.Forbidden: log.error(f"Forbidden channel: ({channel.name}, {channel.id}) from ({self.server.name}, {self.server.id})") return channels def filter_entries(self, contents: list) -> tuple[list, list]: log.debug(f"filtering entries for {self.name} in {self.server.name}") valid_contents = [] invalid_contents = [] for content in contents: log.debug(f"filtering: '{content.item_title}'") if any(content_filter.matches(content) for content_filter in self.filters): invalid_contents.append(content) else: valid_contents.append(content) log.debug(f"filtered content: valid:{len(valid_contents)}, invalid:{len(invalid_contents)}") return valid_contents, invalid_contents @dataclass(slots=True) class Content(DjangoDataModel): id: int subscription_id: int item_id: str item_guid: str item_url: str item_title: str item_description: str _subscription: Subscription | None = None @staticmethod def parser(item: dict) -> dict: item["id"] = item.pop("id") item["subscription_id"] = item.pop("subscription") return item @classmethod def from_raw_rss(cls, raw_rss_content: str, subscription: Subscription): parsed_rss = feedparser.parse(raw_rss_content) contents = [] for entry in parsed_rss.entries: # content_hash = hashlib.new("sha256") # content_hash.update(entry.get("description", "").encode()) # content_hash.hexdigest() data = { "id": -1, "subscription": subscription.id, "item_id": entry.get("id", ""), "item_guid": entry.get("guid", ""), "item_url": entry.get("link", ""), "item_title": entry.get("title", ""), "item_description": entry.get("description", "") } content = Content.from_dict(data) content.subscription = subscription contents.append(content) return contents @property def subscription(self) -> Subscription: return self._subscription @subscription.setter def subscription(self, subscription: Subscription): self._subscription = subscription