# PYRSS-Bot/src/models.py — data models for the PYRSS Discord bot.
import re
import logging
import hashlib
import asyncio
from enum import Enum
from time import perf_counter
from abc import ABC, abstractmethod
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
from textwrap import shorten
import feedparser.parsers
import httpx
import discord
import rapidfuzz
import feedparser
from bs4 import BeautifulSoup
from markdownify import markdownify
from utils import do_batch_job
log = logging.getLogger(__name__)
@dataclass
class DjangoDataModel(ABC):
    """Base class for dataclasses hydrated from Django REST API payloads.

    Subclasses override `parser` to rename/coerce raw API fields into
    constructor keyword arguments.
    """

    @staticmethod
    @abstractmethod
    def parser(item: dict) -> dict:
        """Normalise one raw API item into constructor kwargs."""
        return item

    @classmethod
    def from_dict(cls, data: dict):
        """Build a single instance from one raw API item."""
        return cls(**cls.parser(data))

    @classmethod
    def from_list(cls, data: list[dict]) -> list:
        """Build one instance per raw API item."""
        parse = cls.parser
        return [cls(**parse(item)) for item in data]
@dataclass(slots=True)
class Server(DjangoDataModel):
    """A Discord guild as known to the backing API."""

    id: int
    name: str
    icon_hash: str
    is_bot_operational: bool
    active: bool

    @staticmethod
    def parser(item: dict) -> dict:
        # The API serialises the guild snowflake as a string; coerce it back.
        item["id"] = int(item["id"])
        return item
class MatchingAlgorithm(Enum):
    """How a content filter's pattern is compared against feed content."""

    NONE = 0
    ANY = 1
    ALL = 2
    LITERAL = 3
    REGEX = 4
    FUZZY = 5
    AUTO = 6

    @classmethod
    def from_value(cls, value: int) -> "MatchingAlgorithm":
        """Return the member whose value equals `value`.

        Raises:
            ValueError: if no member has that value.
        """
        for member in cls:
            if member.value == value:
                return member
        # Bug fix: `cls.__class__.__name__` named the enum *metaclass*
        # ("EnumType"), not this enum, in the error message.
        raise ValueError(f"No {cls.__name__} for value: {value}")
@dataclass(slots=True)
class ContentFilter(DjangoDataModel):
id: int
server_id: int
name: str
matching_pattern: str
matching_algorithm: MatchingAlgorithm
is_insensitive: bool
is_whitelist: bool
@staticmethod
def parser(item: dict) -> dict:
item["id"] = item.pop("id")
item["server_id"] = item.pop("server")
item["matching_pattern"] = item.pop("match")
item["matching_algorithm"] = MatchingAlgorithm.from_value(item.pop("matching_algorithm"))
return item
@property
def _regex_flags(self):
return re.IGNORECASE if self.is_insensitive else 0
@property
def cleaned_matching_pattern(self):
"""
Splits the pattern to individual keywords, getting rid of unnecessary
spaces and grouping quoted words together.
"""
findterms = re.compile(r'"([^"]+)"|(\S+)').findall
normspace = re.compile(r"\s+").sub
return [
re.escape(normspace(" ", (t[0] or t[1]).strip())).replace(r"\ ", r"\s+")
for t in findterms(self.matching_pattern)
]
def _match_any(self, matching_against: str):
for word in self.cleaned_matching_pattern:
if re.search(rf"\b{word}\b", matching_against, self._regex_flags):
return True
return False
def _match_all(self, matching_against: str):
for word in self.cleaned_matching_pattern:
if not re.search(rf"\b{word}\b", matching_against, self._regex_flags):
return False
return True
def _match_literal(self, matching_against: str):
return bool(
re.search(
rf"\b{re.escape(self.matching_pattern)}\b",
matching_against,
self._regex_flags
)
)
def _match_regex(self, matching_against: str):
try:
return bool(re.search(
re.compile(self.matching_pattern, self._regex_flags),
matching_against
))
except re.error as exc:
log.error(f"Filter regex error: {exc}")
return False
def _match_fuzzy(self, matching_against: str):
matching_against = re.sub(r"[^\w\s]", "", matching_against)
matching_pattern = re.sub(r"[^\w\s]", "", self.matching_pattern)
if self.is_insensitive:
matching_against = matching_against.lower()
matching_pattern = matching_pattern.lower()
return rapidfuzz.fuzz.partial_ratio(
matching_against,
matching_pattern,
score_cutoff=90
)
def _get_algorithm_func(self):
match self.matching_algorithm:
case MatchingAlgorithm.NONE: return
case MatchingAlgorithm.ANY: return self._match_any
case MatchingAlgorithm.ALL: return self._match_all
case MatchingAlgorithm.LITERAL: return self._match_literal
case MatchingAlgorithm.REGEX: return self._match_regex
case MatchingAlgorithm.FUZZY: return self._match_fuzzy
case _: return
def matches(self, content) -> bool:
log.debug(f"applying filter: {self}")
if not self.matching_pattern.strip():
return False
if self.matching_algorithm == MatchingAlgorithm.ALL:
match_found = self._match_all(content.item_title + " " + content.item_description)
else:
algorithm_func = self._get_algorithm_func()
if not algorithm_func:
log.error(f"Bad algorithm function: {self.matching_algorithm}")
return False
match_found = algorithm_func(content.item_title) or algorithm_func(content.item_description)
log.debug(f"filter match found: {match_found}")
return not match_found if self.is_whitelist else match_found
@dataclass(slots=True)
class MessageMutator(DjangoDataModel):
    """A named text transformation applied to message titles/descriptions."""

    id: int
    name: str
    value: str

    @staticmethod
    def parser(item: dict) -> dict:
        # The payload already matches the constructor; pass it through as-is.
        return item
@dataclass(slots=True)
class MessageStyle(DjangoDataModel):
    """Per-server presentation settings for posted feed messages."""

    id: int
    server_id: int
    name: str
    colour: str
    is_embed: bool
    is_hyperlinked: bool
    show_author: bool
    show_timestamp: bool
    show_images: bool
    fetch_images: bool
    title_mutator: dict | None
    description_mutator: dict | None
    auto_created: bool

    @staticmethod
    def parser(item: dict) -> dict:
        # Coerce string identifiers to ints and flatten the *_detail fields
        # to the names the constructor expects.
        item["id"] = int(item.pop("id"))
        item["server_id"] = int(item.pop("server"))
        for target, source in (
            ("title_mutator", "title_mutator_detail"),
            ("description_mutator", "description_mutator_detail"),
        ):
            item[target] = item.pop(source)
        return item
@dataclass(slots=True)
class DiscordChannel(DjangoDataModel):
    """Lightweight record of a Discord channel a subscription posts to."""

    id: int
    name: str
    is_nsfw: bool

    @staticmethod
    def parser(item: dict) -> dict:
        # Channel snowflakes arrive serialised as strings; coerce to int.
        item["id"] = int(item["id"])
        return item
@dataclass(slots=True)
class Subscription(DjangoDataModel):
    """An RSS subscription owned by a server, with its channels, filters
    and message style.
    """

    id: int
    server_id: int
    name: str
    url: str
    created_at: datetime
    updated_at: datetime
    extra_notes: str
    active: bool
    publish_threshold: datetime
    channels: list[DiscordChannel]
    filters: list[ContentFilter]
    message_style: MessageStyle
    _server: Server | None = None  # back-reference, set via the `server` property

    @staticmethod
    def parser(item: dict) -> dict:
        item["id"] = int(item.pop("id"))
        item["server_id"] = int(item.pop("server"))
        item["created_at"] = datetime.strptime(item.pop("created_at"), "%Y-%m-%dT%H:%M:%S.%f%z")
        item["updated_at"] = datetime.strptime(item.pop("updated_at"), "%Y-%m-%dT%H:%M:%S.%f%z")
        item["publish_threshold"] = datetime.strptime(item.pop("publish_threshold"), "%Y-%m-%dT%H:%M:%S%z")
        item["channels"] = DiscordChannel.from_list(item.pop("channels_detail"))
        item["filters"] = ContentFilter.from_list(item.pop("filters_detail"))
        item["message_style"] = MessageStyle.from_dict(item.pop("message_style_detail"))
        return item

    @property
    def server(self) -> Server:
        return self._server

    @server.setter
    # Bug fix: the annotation previously read `server: server`, which resolved
    # to the property object above rather than the Server class.
    def server(self, server: Server):
        self._server = server

    async def get_rss_content(self, client: httpx.AsyncClient) -> str | None:
        """Fetch the raw RSS body for this subscription.

        Returns None on HTTP failure or when the response does not look
        like XML.
        """
        start_time = perf_counter()
        try:
            response = await client.get(self.url)
            response.raise_for_status()
        except httpx.HTTPError as exc:
            log.error("(%s) HTTP Exception for %s - %s", type(exc), exc.request.url, exc)
            return
        finally:
            log.debug(f"Got rss content in {perf_counter() - start_time:.4f} seconds")
        # Bug fix: the header may be absent entirely; default to "" so the
        # substring test below cannot raise TypeError on None.
        content_type = response.headers.get("Content-Type", "")
        # NOTE(review): many feeds serve "application/rss+xml"/"application/xml",
        # which this check rejects — confirm that is intended.
        if "text/xml" not in content_type:
            log.warning("Invalid 'Content-Type' header: %s (must contain 'text/xml')", content_type)
            return
        return response.text

    async def get_discord_channels(self, bot) -> list[discord.TextChannel]:
        """Resolve the subscription's channel records to live Discord channels.

        Channels that cannot be fetched are skipped (logged at debug level).
        """
        start_time = perf_counter()
        channels = []
        for channel_detail in self.channels:
            try:
                # Prefer the cache; fall back to an API fetch.
                channel = bot.get_channel(channel_detail.id)
                channels.append(channel or await bot.fetch_channel(channel_detail.id))
            except Exception as exc:
                channel_reference = f"({channel_detail.name}, {channel_detail.id})"
                server_reference = f"({self.server.name}, {self.server.id})"
                log.debug(f"Failed to get channel {channel_reference} from {server_reference}: {exc}")
        log.debug(f"Got channels in {perf_counter() - start_time:.4f} seconds")
        return channels

    def filter_entries(self, contents: list) -> tuple[list, list]:
        """Split `contents` into (passed, blocked) using this subscription's
        filters; blocked items get `blocked = True` set on them.
        """
        log.debug(f"filtering entries for {self.name} in {self.server.name}")
        valid_contents = []
        invalid_contents = []
        for content in contents:
            log.debug(f"filtering: '{content.item_title}'")
            if any(content_filter.matches(content) for content_filter in self.filters):
                content.blocked = True
                invalid_contents.append(content)
            else:
                valid_contents.append(content)
        log.debug(f"filtered content: valid:{len(valid_contents)}, invalid:{len(invalid_contents)}")
        return valid_contents, invalid_contents
@dataclass(slots=True)
class Content(DjangoDataModel):
    """A single feed entry, parsed from RSS and mirrored to the API."""

    id: int
    subscription_id: int
    item_id: str
    item_guid: str
    item_url: str
    item_title: str
    item_description: str
    item_content_hash: str
    item_image_url: str | None
    item_thumbnail_url: str | None
    item_published: datetime | None
    item_author: str
    item_author_url: str | None
    item_feed_title: str
    item_feed_url: str
    _subscription: Subscription | None = None  # back-reference, set via property
    blocked: bool = False  # set True by Subscription.filter_entries

    @staticmethod
    def parser(item: dict) -> dict:
        item["id"] = item.pop("id")
        item["subscription_id"] = item.pop("subscription")
        return item

    async def exists_via_api(self, url: str, headers: dict, client: httpx.AsyncClient):
        """Ask the API whether any identifying field of this entry is already
        stored. Returns the matching results list (possibly empty), or False
        if the request itself failed (assume "not a duplicate" on error).
        """
        log.debug(f"checking if {self.item_content_hash} exists via API")
        params = {
            "match_any": True, # allows any param to match, instead of needing all
            "item_id": self.item_id,
            "item_guid": self.item_guid,
            "item_url": self.item_url,
            "item_title": self.item_title,
            "item_content_hash": self.item_content_hash,
            "subscription": self.subscription_id
        }
        log.debug(f"params: {params}")
        try:
            response = await client.get(
                url=url,
                headers=headers,
                params=params
            )
            response.raise_for_status()
        except httpx.HTTPError as exc:
            log.error(f"assuming not duplicate due to error: {exc}")
            return False
        return response.json().get("results", [])

    def is_duplicate(self, other) -> bool:
        """True when any identifying field of `self` equals the same field
        on `other`.

        Raises:
            ValueError: if `other` is not a Content instance.
        """
        if not isinstance(other, Content):
            raise ValueError(f"Expected Content, received {type(other)}")
        other_details = other.duplicate_details
        # Bug fix: skip empty identifiers — previously two entries that both
        # lacked e.g. an item_id ("" == "") were declared duplicates.
        return any(
            value and other_details.get(key) == value
            for key, value in self.duplicate_details.items()
        )

    @property
    def duplicate_details(self) -> dict:
        """The subset of fields used for duplicate detection."""
        keys = [
            "item_id",
            "item_guid",
            "item_url",
            "item_title",
            "item_content_hash"
        ]
        data = asdict(self)
        return { key: data[key] for key in keys }

    async def save(self, client: httpx.AsyncClient, base_url: str, headers: dict):
        """POST this entry to the API's content endpoint.

        Raises:
            httpx.HTTPStatusError: on a non-2xx response.
        """
        log.debug(f"saving content {self.item_content_hash}")
        data = asdict(self)
        data.pop("id")
        data["subscription"] = data.pop("subscription_id")
        item_published = data.pop("item_published")
        # NOTE(review): only the date part is sent — confirm the API does not
        # expect a full timestamp here.
        data["item_published"] = item_published.strftime("%Y-%m-%d") if item_published else None
        data.pop("_subscription")
        response = await client.post(
            url=base_url + "content/",
            headers=headers,
            data=data
        )
        response.raise_for_status()
        log.debug(f"save success for {self.item_content_hash}")

    @classmethod
    async def from_raw_rss(cls, rss: str, subscription: Subscription, client: httpx.AsyncClient):
        """Parse raw RSS text into Content objects for `subscription`.

        Entries older than the subscription's publish threshold and in-batch
        duplicates are dropped. Results are sorted oldest-first.
        """
        style = subscription.message_style
        parsed_rss = feedparser.parse(rss)
        feed_detail = parsed_rss.get("feed", {})
        # Bug fix: feeds frequently omit the channel <image>; the previous
        # attribute chain (parsed_rss.feed.image.href) raised AttributeError.
        feed_image = feed_detail.get("image")
        thumbnail_url = (feed_image.get("href") or None) if feed_image else None
        contents = []
        async def create_content(entry: feedparser.FeedParserDict):
            published_struct = entry.get("published_parsed")
            if published_struct:
                published = datetime(*published_struct[:6], tzinfo=timezone.utc)
            else:
                # Bug fix: entries without a publish date previously crashed on
                # datetime(*None). Treat them as just published so the
                # threshold comparison below still works.
                published = datetime.now(timezone.utc)
            if published < subscription.publish_threshold:
                log.debug("skipping due to publish threshold")
                return
            content_hash = hashlib.new("sha256")
            content_hash.update(entry.get("description", "").encode())
            item_url = entry.get("link", "")
            item_image_url = entry.get("media_thumbnail", [{}])[0].get("url")
            if style.fetch_images:
                item_image_url = await cls.get_image_url(item_url, client)
            content = Content.from_dict({
                "id": -1,
                "subscription": subscription.id,
                "item_id": entry.get("id", ""),
                "item_guid": entry.get("guid", ""),
                "item_url": item_url,
                "item_title": entry.get("title", ""),
                "item_description": entry.get("description", ""),
                "item_content_hash": content_hash.hexdigest(),
                "item_image_url": item_image_url,
                "item_thumbnail_url": thumbnail_url,
                "item_published": published,
                "item_author": entry.get("author", ""),
                "item_author_url": entry.get("author_detail", {}).get("href"),
                "item_feed_title": feed_detail.get("title"),
                "item_feed_url": feed_detail.get("link")
            })
            # Weed out duplicates already collected in this batch
            log.debug("weeding out duplicates")
            if any(content.is_duplicate(other) for other in contents):
                log.debug("found duplicate while loading rss data")
                return
            content.subscription = subscription
            contents.append(content)
        await do_batch_job(parsed_rss.entries, create_content, 15)
        contents.sort(key=lambda k: k.item_published)
        return contents

    @staticmethod
    async def get_image_url(url: str, client: httpx.AsyncClient) -> str | None:
        """Best-effort scrape of the page's og:image URL; None on any failure."""
        log.debug("Fetching image url")
        try:
            response = await client.get(url, timeout=15)
        except httpx.HTTPError:
            return None
        soup = BeautifulSoup(response.text, "html.parser")
        image_element = soup.select_one("meta[property='og:image']")
        if not image_element:
            return None
        return image_element.get("content")

    @property
    def subscription(self) -> Subscription:
        return self._subscription

    @subscription.setter
    def subscription(self, subscription: Subscription):
        self._subscription = subscription

    @property
    def embed(self):
        """Build the Discord embed for this entry, respecting Discord's
        length limits (256 title / 4096 description / 6000 combined).
        """
        colour = discord.Colour.from_str(
            f"#{self.subscription.message_style.colour}"
        )
        # ensure content fits within character limits
        title = shorten(markdownify(self.item_title, strip=("img", "a")), 256)
        description = shorten(markdownify(self.item_description, strip=("img",)), 4096)
        author = self.item_author or self.item_feed_title
        combined_length = len(title) + len(description) + (len(author) * 2)
        overflow = combined_length - 6000
        if overflow > 0:
            # Bug fix: previously `shorten(description, overflow)` truncated
            # the description *to* the overflow amount instead of *by* it.
            # Keep a small floor so shorten's " [...]" placeholder fits.
            description = shorten(description, max(len(description) - overflow, 8))
        embed = discord.Embed(
            title=title,
            description=description,
            url=self.item_url,
            colour=colour,
            timestamp=self.item_published
        )
        embed.set_image(url=self.item_image_url)
        embed.set_thumbnail(url=self.item_thumbnail_url)
        embed.set_author(
            name=author,
            url=self.item_author_url or self.item_feed_url
        )
        embed.set_footer(text=self.subscription.name)
        log.debug(f"created embed: {embed.to_dict()}")
        return embed