# PYRSS-Bot/src/models.py — data models for the PYRSS Discord bot.
import re
import logging
import hashlib
import asyncio
from enum import Enum
from time import perf_counter
from abc import ABC, abstractmethod
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
from textwrap import shorten
import feedparser.parsers
import httpx
import discord
import rapidfuzz
import feedparser
from bs4 import BeautifulSoup
from markdownify import markdownify
from utils import do_batch_job
log = logging.getLogger(__name__)
@dataclass
class DjangoDataModel(ABC):
    """Base class for dataclasses hydrated from Django REST API payloads.

    Subclasses override `parser` to rename/coerce raw API fields into
    constructor keyword arguments.
    """

    @staticmethod
    @abstractmethod
    def parser(item: dict) -> dict:
        """Normalise one raw API item into constructor kwargs."""
        return item

    @classmethod
    def from_dict(cls, data: dict):
        """Build a single instance from one raw API item."""
        return cls(**cls.parser(data))

    @classmethod
    def from_list(cls, data: list[dict]) -> list:
        """Build one instance per raw API item."""
        parse = cls.parser
        return [cls(**parse(item)) for item in data]
@dataclass(slots=True)
class Server(DjangoDataModel):
    """A Discord guild as known to the backing API."""

    id: int
    name: str
    icon_hash: str
    is_bot_operational: bool
    active: bool

    @staticmethod
    def parser(item: dict) -> dict:
        # The API serialises the guild snowflake as a string; coerce it back.
        item["id"] = int(item["id"])
        return item
class MatchingAlgorithm(Enum):
    """How a content filter's pattern is compared against feed content."""

    NONE = 0
    ANY = 1
    ALL = 2
    LITERAL = 3
    REGEX = 4
    FUZZY = 5
    AUTO = 6

    @classmethod
    def from_value(cls, value: int) -> "MatchingAlgorithm":
        """Return the member whose value equals `value`.

        Raises:
            ValueError: if no member has that value.
        """
        for member in cls:
            if member.value == value:
                return member
        # Bug fix: `cls.__class__.__name__` named the enum *metaclass*
        # ("EnumType"), not this enum, in the error message.
        raise ValueError(f"No {cls.__name__} for value: {value}")
@dataclass(slots=True)
class ContentFilter(DjangoDataModel):
id: int
server_id: int
name: str
matching_pattern: str
matching_algorithm: MatchingAlgorithm
is_insensitive: bool
is_whitelist: bool
@staticmethod
def parser(item: dict) -> dict:
item["id"] = item.pop("id")
item["server_id"] = item.pop("server")
item["matching_pattern"] = item.pop("match")
item["matching_algorithm"] = MatchingAlgorithm.from_value(item.pop("matching_algorithm"))
return item
@property
def _regex_flags(self):
return re.IGNORECASE if self.is_insensitive else 0
@property
def cleaned_matching_pattern(self):
"""
Splits the pattern to individual keywords, getting rid of unnecessary
spaces and grouping quoted words together.
"""
findterms = re.compile(r'"([^"]+)"|(\S+)').findall
normspace = re.compile(r"\s+").sub
return [
re.escape(normspace(" ", (t[0] or t[1]).strip())).replace(r"\ ", r"\s+")
for t in findterms(self.matching_pattern)
]
def _match_any(self, matching_against: str):
for word in self.cleaned_matching_pattern:
if re.search(rf"\b{word}\b", matching_against, self._regex_flags):
return True
return False
def _match_all(self, matching_against: str):
for word in self.cleaned_matching_pattern:
if not re.search(rf"\b{word}\b", matching_against, self._regex_flags):
return False
return True
def _match_literal(self, matching_against: str):
return bool(
re.search(
rf"\b{re.escape(self.matching_pattern)}\b",
matching_against,
self._regex_flags
)
)
def _match_regex(self, matching_against: str):
try:
return bool(re.search(
re.compile(self.matching_pattern, self._regex_flags),
matching_against
))
except re.error as exc:
log.error(f"Filter regex error: {exc}")
return False
def _match_fuzzy(self, matching_against: str):
matching_against = re.sub(r"[^\w\s]", "", matching_against)
matching_pattern = re.sub(r"[^\w\s]", "", self.matching_pattern)
if self.is_insensitive:
matching_against = matching_against.lower()
matching_pattern = matching_pattern.lower()
return rapidfuzz.fuzz.partial_ratio(
matching_against,
matching_pattern,
score_cutoff=90
)
def _get_algorithm_func(self):
match self.matching_algorithm:
case MatchingAlgorithm.NONE: return
case MatchingAlgorithm.ANY: return self._match_any
case MatchingAlgorithm.ALL: return self._match_all
case MatchingAlgorithm.LITERAL: return self._match_literal
case MatchingAlgorithm.REGEX: return self._match_regex
case MatchingAlgorithm.FUZZY: return self._match_fuzzy
case _: return
def matches(self, content) -> bool:
log.debug(f"applying filter: {self}")
if not self.matching_pattern.strip():
return False
if self.matching_algorithm == MatchingAlgorithm.ALL:
match_found = self._match_all(content.item_title + " " + content.item_description)
else:
algorithm_func = self._get_algorithm_func()
if not algorithm_func:
log.error(f"Bad algorithm function: {self.matching_algorithm}")
return False
match_found = algorithm_func(content.item_title) or algorithm_func(content.item_description)
log.debug(f"filter match found: {match_found}")
return not match_found if self.is_whitelist else match_found
@dataclass(slots=True)
class MessageMutator(DjangoDataModel):
    """A named text transformation applied to message titles/descriptions."""

    id: int
    name: str
    value: str

    @staticmethod
    def parser(item: dict) -> dict:
        # The payload already matches the constructor; pass it through as-is.
        return item
@dataclass(slots=True)
class MessageStyle(DjangoDataModel):
    """Per-server presentation settings for posted feed messages."""

    id: int
    server_id: int
    name: str
    colour: str
    is_embed: bool
    is_hyperlinked: bool
    show_author: bool
    show_timestamp: bool
    show_images: bool
    fetch_images: bool
    title_mutator: dict | None
    description_mutator: dict | None
    auto_created: bool

    @staticmethod
    def parser(item: dict) -> dict:
        # Coerce string identifiers to ints and flatten the *_detail fields
        # to the names the constructor expects.
        item["id"] = int(item.pop("id"))
        item["server_id"] = int(item.pop("server"))
        for target, source in (
            ("title_mutator", "title_mutator_detail"),
            ("description_mutator", "description_mutator_detail"),
        ):
            item[target] = item.pop(source)
        return item
@dataclass(slots=True)
class DiscordChannel(DjangoDataModel):
    """Lightweight record of a Discord channel a subscription posts to."""

    id: int
    name: str
    is_nsfw: bool

    @staticmethod
    def parser(item: dict) -> dict:
        # Channel snowflakes arrive serialised as strings; coerce to int.
        item["id"] = int(item["id"])
        return item
@dataclass(slots=True)
class Subscription(DjangoDataModel):
    """An RSS subscription owned by a server, with its channels, filters
    and message style.
    """

    id: int
    server_id: int
    name: str
    url: str
    created_at: datetime
    updated_at: datetime
    extra_notes: str
    active: bool
    publish_threshold: datetime
    channels: list[DiscordChannel]
    filters: list[ContentFilter]
    message_style: MessageStyle
    _server: Server | None = None  # back-reference, set via the `server` property

    @staticmethod
    def parser(item: dict) -> dict:
        item["id"] = int(item.pop("id"))
        item["server_id"] = int(item.pop("server"))
        item["created_at"] = datetime.strptime(item.pop("created_at"), "%Y-%m-%dT%H:%M:%S.%f%z")
        item["updated_at"] = datetime.strptime(item.pop("updated_at"), "%Y-%m-%dT%H:%M:%S.%f%z")
        item["publish_threshold"] = datetime.strptime(item.pop("publish_threshold"), "%Y-%m-%dT%H:%M:%S%z")
        item["channels"] = DiscordChannel.from_list(item.pop("channels_detail"))
        item["filters"] = ContentFilter.from_list(item.pop("filters_detail"))
        item["message_style"] = MessageStyle.from_dict(item.pop("message_style_detail"))
        return item

    @property
    def server(self) -> Server:
        return self._server

    @server.setter
    # Bug fix: the annotation previously read `server: server`, which resolved
    # to the property object above rather than the Server class.
    def server(self, server: Server):
        self._server = server

    async def get_rss_content(self, client: httpx.AsyncClient) -> str | None:
        """Fetch the raw RSS body for this subscription.

        Returns None on HTTP failure or when the response does not look
        like XML.
        """
        start_time = perf_counter()
        try:
            response = await client.get(self.url)
            response.raise_for_status()
        except httpx.HTTPError as exc:
            log.error("(%s) HTTP Exception for %s - %s", type(exc), exc.request.url, exc)
            return
        finally:
            log.debug(f"Got rss content in {perf_counter() - start_time:.4f} seconds")
        # Bug fix: the header may be absent entirely; default to "" so the
        # substring test below cannot raise TypeError on None.
        content_type = response.headers.get("Content-Type", "")
        # NOTE(review): many feeds serve "application/rss+xml"/"application/xml",
        # which this check rejects — confirm that is intended.
        if "text/xml" not in content_type:
            log.warning("Invalid 'Content-Type' header: %s (must contain 'text/xml')", content_type)
            return
        return response.text

    async def get_discord_channels(self, bot) -> list[discord.TextChannel]:
        """Resolve the subscription's channel records to live Discord channels.

        Channels that cannot be fetched are skipped (logged at debug level).
        """
        start_time = perf_counter()
        channels = []
        for channel_detail in self.channels:
            try:
                # Prefer the cache; fall back to an API fetch.
                channel = bot.get_channel(channel_detail.id)
                channels.append(channel or await bot.fetch_channel(channel_detail.id))
            except Exception as exc:
                channel_reference = f"({channel_detail.name}, {channel_detail.id})"
                server_reference = f"({self.server.name}, {self.server.id})"
                log.debug(f"Failed to get channel {channel_reference} from {server_reference}: {exc}")
        log.debug(f"Got channels in {perf_counter() - start_time:.4f} seconds")
        return channels

    def filter_entries(self, contents: list) -> tuple[list, list]:
        """Split `contents` into (passed, blocked) using this subscription's
        filters; blocked items get `blocked = True` set on them.
        """
        log.debug(f"filtering entries for {self.name} in {self.server.name}")
        valid_contents = []
        invalid_contents = []
        for content in contents:
            log.debug(f"filtering: '{content.item_title}'")
            if any(content_filter.matches(content) for content_filter in self.filters):
                content.blocked = True
                invalid_contents.append(content)
            else:
                valid_contents.append(content)
        log.debug(f"filtered content: valid:{len(valid_contents)}, invalid:{len(invalid_contents)}")
        return valid_contents, invalid_contents
@dataclass(slots=True)
class Content(DjangoDataModel):
    """A single feed entry, parsed from RSS and mirrored to the API."""

    id: int
    subscription_id: int
    item_id: str
    item_guid: str
    item_url: str
    item_title: str
    item_description: str
    item_content_hash: str
    item_image_url: str | None
    item_thumbnail_url: str | None
    item_published: datetime | None
    item_author: str
    item_author_url: str | None
    item_feed_title: str
    item_feed_url: str
    _subscription: Subscription | None = None  # back-reference, set via property
    blocked: bool = False  # set True by Subscription.filter_entries

    @staticmethod
    def parser(item: dict) -> dict:
        item["id"] = item.pop("id")
        item["subscription_id"] = item.pop("subscription")
        return item

    async def exists_via_api(self, url: str, headers: dict, client: httpx.AsyncClient):
        """Ask the API whether any identifying field of this entry is already
        stored. Returns the matching results list (possibly empty), or False
        if the request itself failed (assume "not a duplicate" on error).
        """
        log.debug(f"checking if {self.item_content_hash} exists via API")
        params = {
            "match_any": True, # allows any param to match, instead of needing all
            "item_id": self.item_id,
            "item_guid": self.item_guid,
            "item_url": self.item_url,
            "item_title": self.item_title,
            "item_content_hash": self.item_content_hash,
            "subscription": self.subscription_id
        }
        log.debug(f"params: {params}")
        try:
            response = await client.get(
                url=url,
                headers=headers,
                params=params
            )
            response.raise_for_status()
        except httpx.HTTPError as exc:
            log.error(f"assuming not duplicate due to error: {exc}")
            return False
        return response.json().get("results", [])

    def is_duplicate(self, other) -> bool:
        """True when any identifying field of `self` equals the same field
        on `other`.

        Raises:
            ValueError: if `other` is not a Content instance.
        """
        if not isinstance(other, Content):
            raise ValueError(f"Expected Content, received {type(other)}")
        other_details = other.duplicate_details
        # Bug fix: skip empty identifiers — previously two entries that both
        # lacked e.g. an item_id ("" == "") were declared duplicates.
        return any(
            value and other_details.get(key) == value
            for key, value in self.duplicate_details.items()
        )

    @property
    def duplicate_details(self) -> dict:
        """The subset of fields used for duplicate detection."""
        keys = [
            "item_id",
            "item_guid",
            "item_url",
            "item_title",
            "item_content_hash"
        ]
        data = asdict(self)
        return { key: data[key] for key in keys }

    async def save(self, client: httpx.AsyncClient, base_url: str, headers: dict):
        """POST this entry to the API's content endpoint.

        Raises:
            httpx.HTTPStatusError: on a non-2xx response.
        """
        log.debug(f"saving content {self.item_content_hash}")
        data = asdict(self)
        data.pop("id")
        data["subscription"] = data.pop("subscription_id")
        item_published = data.pop("item_published")
        # NOTE(review): only the date part is sent — confirm the API does not
        # expect a full timestamp here.
        data["item_published"] = item_published.strftime("%Y-%m-%d") if item_published else None
        data.pop("_subscription")
        response = await client.post(
            url=base_url + "content/",
            headers=headers,
            data=data
        )
        response.raise_for_status()
        log.debug(f"save success for {self.item_content_hash}")

    @classmethod
    async def from_raw_rss(cls, rss: str, subscription: Subscription, client: httpx.AsyncClient):
        """Parse raw RSS text into Content objects for `subscription`.

        Entries older than the subscription's publish threshold and in-batch
        duplicates are dropped. Results are sorted oldest-first.
        """
        style = subscription.message_style
        parsed_rss = feedparser.parse(rss)
        feed_detail = parsed_rss.get("feed", {})
        # Bug fix: feeds frequently omit the channel <image>; the previous
        # attribute chain (parsed_rss.feed.image.href) raised AttributeError.
        feed_image = feed_detail.get("image")
        thumbnail_url = (feed_image.get("href") or None) if feed_image else None
        contents = []
        async def create_content(entry: feedparser.FeedParserDict):
            published_struct = entry.get("published_parsed")
            if published_struct:
                published = datetime(*published_struct[:6], tzinfo=timezone.utc)
            else:
                # Bug fix: entries without a publish date previously crashed on
                # datetime(*None). Treat them as just published so the
                # threshold comparison below still works.
                published = datetime.now(timezone.utc)
            if published < subscription.publish_threshold:
                log.debug("skipping due to publish threshold")
                return
            content_hash = hashlib.new("sha256")
            content_hash.update(entry.get("description", "").encode())
            item_url = entry.get("link", "")
            item_image_url = entry.get("media_thumbnail", [{}])[0].get("url")
            if style.fetch_images:
                item_image_url = await cls.get_image_url(item_url, client)
            content = Content.from_dict({
                "id": -1,
                "subscription": subscription.id,
                "item_id": entry.get("id", ""),
                "item_guid": entry.get("guid", ""),
                "item_url": item_url,
                "item_title": entry.get("title", ""),
                "item_description": entry.get("description", ""),
                "item_content_hash": content_hash.hexdigest(),
                "item_image_url": item_image_url,
                "item_thumbnail_url": thumbnail_url,
                "item_published": published,
                "item_author": entry.get("author", ""),
                "item_author_url": entry.get("author_detail", {}).get("href"),
                "item_feed_title": feed_detail.get("title"),
                "item_feed_url": feed_detail.get("link")
            })
            # Weed out duplicates already collected in this batch
            log.debug("weeding out duplicates")
            if any(content.is_duplicate(other) for other in contents):
                log.debug("found duplicate while loading rss data")
                return
            content.subscription = subscription
            contents.append(content)
        await do_batch_job(parsed_rss.entries, create_content, 15)
        contents.sort(key=lambda k: k.item_published)
        return contents

    @staticmethod
    async def get_image_url(url: str, client: httpx.AsyncClient) -> str | None:
        """Best-effort scrape of the page's og:image URL; None on any failure."""
        log.debug("Fetching image url")
        try:
            response = await client.get(url, timeout=15)
        except httpx.HTTPError:
            return None
        soup = BeautifulSoup(response.text, "html.parser")
        image_element = soup.select_one("meta[property='og:image']")
        if not image_element:
            return None
        return image_element.get("content")

    @property
    def subscription(self) -> Subscription:
        return self._subscription

    @subscription.setter
    def subscription(self, subscription: Subscription):
        self._subscription = subscription

    @property
    def embed(self):
        """Build the Discord embed for this entry, respecting Discord's
        length limits (256 title / 4096 description / 6000 combined).
        """
        colour = discord.Colour.from_str(
            f"#{self.subscription.message_style.colour}"
        )
        # ensure content fits within character limits
        title = shorten(markdownify(self.item_title, strip=("img", "a")), 256)
        description = shorten(markdownify(self.item_description, strip=("img",)), 4096)
        author = self.item_author or self.item_feed_title
        combined_length = len(title) + len(description) + (len(author) * 2)
        overflow = combined_length - 6000
        if overflow > 0:
            # Bug fix: previously `shorten(description, overflow)` truncated
            # the description *to* the overflow amount instead of *by* it.
            # Keep a small floor so shorten's " [...]" placeholder fits.
            description = shorten(description, max(len(description) - overflow, 8))
        embed = discord.Embed(
            title=title,
            description=description,
            url=self.item_url,
            colour=colour,
            timestamp=self.item_published
        )
        embed.set_image(url=self.item_image_url)
        embed.set_thumbnail(url=self.item_thumbnail_url)
        embed.set_author(
            name=author,
            url=self.item_author_url or self.item_feed_url
        )
        embed.set_footer(text=self.subscription.name)
        log.debug(f"created embed: {embed.to_dict()}")
        return embed