import re
import logging
import hashlib
import asyncio
from enum import Enum
from time import perf_counter
from abc import ABC, abstractmethod
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
from textwrap import shorten

import httpx
import discord
import rapidfuzz
import feedparser
from bs4 import BeautifulSoup
from markdownify import markdownify

from utils import do_batch_job

log = logging.getLogger(__name__)


@dataclass
class DjangoDataModel(ABC):
    """Base class for models deserialised from the Django REST API."""

    @staticmethod
    @abstractmethod
    def parser(item: dict) -> dict:
        return item

    @classmethod
    def from_list(cls, data: list[dict]) -> list:
        return [cls(**cls.parser(item)) for item in data]

    @classmethod
    def from_dict(cls, data: dict):
        return cls(**cls.parser(data))
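
# A minimal usage sketch, assuming a hypothetical API payload (not real
# data): subclasses override `parser` to normalise one raw dict, and the
# base class applies it across whole result lists.
#
#   raw = [{"id": "1", "name": "demo", "icon_hash": "",
#           "is_bot_operational": True, "active": True}]
#   servers = Server.from_list(raw)   # -> [Server(id=1, name="demo", ...)]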


@dataclass(slots=True)
class Server(DjangoDataModel):
    id: int
    name: str
    icon_hash: str
    is_bot_operational: bool
    active: bool

    @staticmethod
    def parser(item: dict) -> dict:
        item["id"] = int(item.pop("id"))
        return item


class MatchingAlgorithm(Enum):
    NONE = 0
    ANY = 1
    ALL = 2
    LITERAL = 3
    REGEX = 4
    FUZZY = 5
    AUTO = 6

    @classmethod
    def from_value(cls, value: int):
        for member in cls:
            if member.value == value:
                return member

        # cls is already the class here, so use cls.__name__
        # (cls.__class__.__name__ would name the metaclass instead)
        raise ValueError(f"No {cls.__name__} for value: {value}")


@dataclass(slots=True)
class ContentFilter(DjangoDataModel):
    id: int
    server_id: int
    name: str
    matching_pattern: str
    matching_algorithm: MatchingAlgorithm
    is_insensitive: bool
    is_whitelist: bool

    @staticmethod
    def parser(item: dict) -> dict:
        item["id"] = item.pop("id")
        item["server_id"] = item.pop("server")
        item["matching_pattern"] = item.pop("match")
        item["matching_algorithm"] = MatchingAlgorithm.from_value(item.pop("matching_algorithm"))
        return item

    @property
    def _regex_flags(self):
        return re.IGNORECASE if self.is_insensitive else 0

    @property
    def cleaned_matching_pattern(self):
        """
        Split the pattern into individual keywords, stripping unnecessary
        whitespace and grouping quoted words together.
        """
        findterms = re.compile(r'"([^"]+)"|(\S+)').findall
        normspace = re.compile(r"\s+").sub
        return [
            re.escape(normspace(" ", (t[0] or t[1]).strip())).replace(r"\ ", r"\s+")
            for t in findterms(self.matching_pattern)
        ]
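
    # A quick illustrative example (hypothetical pattern, not from the API):
    #
    #   ContentFilter(..., matching_pattern='"free game" epic').cleaned_matching_pattern
    #   -> ['free\\s+game', 'epic']
    #
    # i.e. quoted phrases survive as a single term whose internal spaces
    # match any run of whitespace.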

    def _match_any(self, matching_against: str):
        for word in self.cleaned_matching_pattern:
            if re.search(rf"\b{word}\b", matching_against, self._regex_flags):
                return True

        return False

    def _match_all(self, matching_against: str):
        for word in self.cleaned_matching_pattern:
            if not re.search(rf"\b{word}\b", matching_against, self._regex_flags):
                return False

        return True

    def _match_literal(self, matching_against: str):
        return bool(
            re.search(
                rf"\b{re.escape(self.matching_pattern)}\b",
                matching_against,
                self._regex_flags
            )
        )

    def _match_regex(self, matching_against: str):
        try:
            return bool(re.search(
                re.compile(self.matching_pattern, self._regex_flags),
                matching_against
            ))
        except re.error as exc:
            log.error(f"Filter regex error: {exc}")
            return False

    def _match_fuzzy(self, matching_against: str):
        matching_against = re.sub(r"[^\w\s]", "", matching_against)
        matching_pattern = re.sub(r"[^\w\s]", "", self.matching_pattern)
        if self.is_insensitive:
            matching_against = matching_against.lower()
            matching_pattern = matching_pattern.lower()

        # partial_ratio returns 0 when below score_cutoff, so coerce the
        # score to a bool for consistency with the other matchers
        return bool(rapidfuzz.fuzz.partial_ratio(
            matching_against,
            matching_pattern,
            score_cutoff=90
        ))

    def _get_algorithm_func(self):
        match self.matching_algorithm:
            case MatchingAlgorithm.NONE: return
            case MatchingAlgorithm.ANY: return self._match_any
            case MatchingAlgorithm.ALL: return self._match_all
            case MatchingAlgorithm.LITERAL: return self._match_literal
            case MatchingAlgorithm.REGEX: return self._match_regex
            case MatchingAlgorithm.FUZZY: return self._match_fuzzy
            case _: return

    def matches(self, content: "Content") -> bool:
        log.debug(f"applying filter: {self}")

        if not self.matching_pattern.strip():
            return False

        if self.matching_algorithm == MatchingAlgorithm.ALL:
            # ALL must see title and description together, otherwise a
            # pattern whose terms are split across the two fields would
            # never match
            match_found = self._match_all(content.item_title + " " + content.item_description)
        else:
            algorithm_func = self._get_algorithm_func()
            if not algorithm_func:
                log.error(f"Bad algorithm function: {self.matching_algorithm}")
                return False

            match_found = algorithm_func(content.item_title) or algorithm_func(content.item_description)

        log.debug(f"filter match found: {match_found}")
        return not match_found if self.is_whitelist else match_found
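

# A minimal filtering sketch (hypothetical values, not from the API):
#
#   f = ContentFilter(id=1, server_id=1, name="no spoilers",
#                     matching_pattern="spoiler",
#                     matching_algorithm=MatchingAlgorithm.ANY,
#                     is_insensitive=True, is_whitelist=False)
#   f._match_any("Spoiler alert!")  # -> True, so `matches` blocks the item
#
# With is_whitelist=True the result is inverted: only matching items pass.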


@dataclass(slots=True)
class MessageMutator(DjangoDataModel):
    id: int
    name: str
    value: str

    @staticmethod
    def parser(item: dict) -> dict:
        item["id"] = item.pop("id")
        return item


@dataclass(slots=True)
class MessageStyle(DjangoDataModel):
    id: int
    server_id: int
    name: str
    colour: str
    is_embed: bool
    is_hyperlinked: bool
    show_author: bool
    show_timestamp: bool
    show_images: bool
    fetch_images: bool
    title_mutator: dict | None
    description_mutator: dict | None
    auto_created: bool

    @staticmethod
    def parser(item: dict) -> dict:
        item["id"] = int(item.pop("id"))
        item["server_id"] = int(item.pop("server"))
        item["title_mutator"] = item.pop("title_mutator_detail")
        item["description_mutator"] = item.pop("description_mutator_detail")
        return item


@dataclass(slots=True)
class DiscordChannel(DjangoDataModel):
    id: int
    name: str
    is_nsfw: bool

    @staticmethod
    def parser(item: dict) -> dict:
        item["id"] = int(item.pop("id"))
        return item


@dataclass(slots=True)
class Subscription(DjangoDataModel):
    id: int
    server_id: int
    name: str
    url: str
    created_at: datetime
    updated_at: datetime
    extra_notes: str
    active: bool
    publish_threshold: datetime
    channels: list[DiscordChannel]
    filters: list[ContentFilter]
    message_style: MessageStyle
    _server: Server | None = None

    @staticmethod
    def parser(item: dict) -> dict:
        item["id"] = int(item.pop("id"))
        item["server_id"] = int(item.pop("server"))
        item["created_at"] = datetime.strptime(item.pop("created_at"), "%Y-%m-%dT%H:%M:%S.%f%z")
        item["updated_at"] = datetime.strptime(item.pop("updated_at"), "%Y-%m-%dT%H:%M:%S.%f%z")
        item["publish_threshold"] = datetime.strptime(item.pop("publish_threshold"), "%Y-%m-%dT%H:%M:%S%z")
        item["channels"] = DiscordChannel.from_list(item.pop("channels_detail"))
        item["filters"] = ContentFilter.from_list(item.pop("filters_detail"))
        item["message_style"] = MessageStyle.from_dict(item.pop("message_style_detail"))
        return item

    @property
    def server(self) -> Server:
        return self._server

    @server.setter
    def server(self, server: Server):
        self._server = server

    async def get_rss_content(self, client: httpx.AsyncClient) -> str | None:
        start_time = perf_counter()

        try:
            response = await client.get(self.url)
            response.raise_for_status()
        except httpx.HTTPError as exc:
            log.error("(%s) HTTP Exception for %s - %s", type(exc), exc.request.url, exc)
            return None
        finally:
            log.debug(f"Got rss content in {perf_counter() - start_time:.4f} seconds")

        # default to "" so a missing header fails the check instead of
        # raising a TypeError
        content_type = response.headers.get("Content-Type", "")
        if "text/xml" not in content_type:
            log.warning("Invalid 'Content-Type' header: %s (must contain 'text/xml')", content_type)
            return None

        return response.text

    async def get_discord_channels(self, bot) -> list[discord.TextChannel]:
        start_time = perf_counter()
        channels = []

        for channel_detail in self.channels:
            try:
                # prefer the cache; fall back to an API fetch on a miss
                channel = bot.get_channel(channel_detail.id)
                channels.append(channel or await bot.fetch_channel(channel_detail.id))
            except Exception as exc:
                channel_reference = f"({channel_detail.name}, {channel_detail.id})"
                server_reference = f"({self.server.name}, {self.server.id})"
                log.debug(f"Failed to get channel {channel_reference} from {server_reference}: {exc}")

        log.debug(f"Got channels in {perf_counter() - start_time:.4f} seconds")
        return channels

    def filter_entries(self, contents: list) -> tuple[list, list]:
        log.debug(f"filtering entries for {self.name} in {self.server.name}")

        valid_contents = []
        invalid_contents = []

        for content in contents:
            log.debug(f"filtering: '{content.item_title}'")
            if any(content_filter.matches(content) for content_filter in self.filters):
                content.blocked = True
                invalid_contents.append(content)
            else:
                valid_contents.append(content)

        log.debug(f"filtered content: valid:{len(valid_contents)}, invalid:{len(invalid_contents)}")
        return valid_contents, invalid_contents
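
    # Hypothetical illustration of the return shape (`sub` and `contents`
    # are assumptions, not defined in this module):
    #
    #   valid, blocked = sub.filter_entries(contents)
    #   # blocked items come back with .blocked = True for logging;
    #   # valid items go on to be posted.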


@dataclass(slots=True)
class Content(DjangoDataModel):
    id: int
    subscription_id: int
    item_id: str
    item_guid: str
    item_url: str
    item_title: str
    item_description: str
    item_content_hash: str
    item_image_url: str | None
    item_thumbnail_url: str | None
    item_published: datetime | None
    item_author: str
    item_author_url: str | None
    item_feed_title: str
    item_feed_url: str
    _subscription: Subscription | None = None
    blocked: bool = False

    @staticmethod
    def parser(item: dict) -> dict:
        item["id"] = item.pop("id")
        item["subscription_id"] = item.pop("subscription")
        return item

    async def exists_via_api(self, url: str, headers: dict, client: httpx.AsyncClient):
        log.debug(f"checking if {self.item_content_hash} exists via API")
        params = {
            "match_any": True,  # allows any param to match, instead of needing all
            "item_id": self.item_id,
            "item_guid": self.item_guid,
            "item_url": self.item_url,
            "item_title": self.item_title,
            "item_content_hash": self.item_content_hash,
            "subscription": self.subscription_id
        }

        log.debug(f"params: {params}")

        try:
            response = await client.get(
                url=url,
                headers=headers,
                params=params
            )
            response.raise_for_status()
        except httpx.HTTPError as exc:
            log.error(f"assuming not duplicate due to error: {exc}")
            return False

        # truthy when any matching rows come back, falsy otherwise
        return response.json().get("results", [])

    def is_duplicate(self, other):
        if not isinstance(other, Content):
            raise ValueError(f"Expected Content, received {type(other)}")

        other_details = other.duplicate_details
        return any(
            other_details.get(key) == value
            for key, value in self.duplicate_details.items()
        )

    @property
    def duplicate_details(self):
        keys = [
            "item_id",
            "item_guid",
            "item_url",
            "item_title",
            "item_content_hash"
        ]
        data = asdict(self)
        return {key: data[key] for key in keys}
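
    # Two items count as duplicates when ANY of these fields coincide;
    # e.g. (hypothetical values) two items sharing only item_url:
    #
    #   a.duplicate_details  # -> {"item_url": "https://example.com/p/1", ...}
    #   b.duplicate_details  # -> {"item_url": "https://example.com/p/1", ...}
    #   a.is_duplicate(b)    # -> True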

    async def save(self, client: httpx.AsyncClient, base_url: str, headers: dict):
        log.debug(f"saving content {self.item_content_hash}")

        data = asdict(self)
        data.pop("id")
        data["subscription"] = data.pop("subscription_id")
        item_published = data.pop("item_published")
        data["item_published"] = item_published.strftime("%Y-%m-%d") if item_published else None
        data.pop("_subscription")

        response = await client.post(
            url=base_url + "content/",
            headers=headers,
            data=data
        )
        response.raise_for_status()
        log.debug(f"save success for {self.item_content_hash}")

    @classmethod
    async def from_raw_rss(cls, rss: str, subscription: Subscription, client: httpx.AsyncClient):
        style = subscription.message_style
        parsed_rss = feedparser.parse(rss)
        contents = []

        async def create_content(entry: feedparser.FeedParserDict):
            # feeds occasionally omit dates; treat undated entries as just
            # published rather than crashing on the missing struct_time
            published_parsed = entry.get("published_parsed")
            published = (
                datetime(*published_parsed[:6], tzinfo=timezone.utc)
                if published_parsed else datetime.now(timezone.utc)
            )

            if published < subscription.publish_threshold:
                log.debug("skipping due to publish threshold")
                return

            content_hash = hashlib.new("sha256")
            content_hash.update(entry.get("description", "").encode())

            item_url = entry.get("link", "")
            item_image_url = entry.get("media_thumbnail", [{}])[0].get("url")
            if style.fetch_images:
                item_image_url = await cls.get_image_url(item_url, client)

            content = Content.from_dict({
                "id": -1,
                "subscription": subscription.id,
                "item_id": entry.get("id", ""),
                "item_guid": entry.get("guid", ""),
                "item_url": item_url,
                "item_title": entry.get("title", ""),
                "item_description": entry.get("description", ""),
                "item_content_hash": content_hash.hexdigest(),
                "item_image_url": item_image_url,
                # .get avoids the AttributeError that feeds without an
                # <image> element would otherwise raise
                "item_thumbnail_url": parsed_rss.feed.get("image", {}).get("href") or None,
                "item_published": published,
                "item_author": entry.get("author", ""),
                "item_author_url": entry.get("author_detail", {}).get("href"),
                "item_feed_title": parsed_rss.get("feed", {}).get("title"),
                "item_feed_url": parsed_rss.get("feed", {}).get("link")
            })

            # Weed out duplicates already collected from this feed
            log.debug("weeding out duplicates")
            if any(content.is_duplicate(other) for other in contents):
                log.debug("found duplicate while loading rss data")
                return

            content.subscription = subscription
            contents.append(content)

        await do_batch_job(parsed_rss.entries, create_content, 15)
        contents.sort(key=lambda k: k.item_published)
        return contents
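
    # A minimal end-to-end sketch (`subscription` here is a hypothetical
    # Subscription instance, not defined in this module):
    #
    #   async with httpx.AsyncClient() as client:
    #       rss = await subscription.get_rss_content(client)
    #       if rss:
    #           contents = await Content.from_raw_rss(rss, subscription, client)
    #           valid, blocked = subscription.filter_entries(contents)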

    @staticmethod
    async def get_image_url(url: str, client: httpx.AsyncClient) -> str | None:
        log.debug("Fetching image url")

        try:
            response = await client.get(url, timeout=15)
            # bail out on error responses instead of scraping error pages
            response.raise_for_status()
        except httpx.HTTPError:
            return None

        soup = BeautifulSoup(response.text, "html.parser")
        image_element = soup.select_one("meta[property='og:image']")
        if not image_element:
            return None

        return image_element.get("content")

    @property
    def subscription(self) -> Subscription:
        return self._subscription

    @subscription.setter
    def subscription(self, subscription: Subscription):
        self._subscription = subscription

    @property
    def embed(self):
        colour = discord.Colour.from_str(
            f"#{self.subscription.message_style.colour}"
        )

        # ensure content fits within Discord's per-field character limits
        title = shorten(markdownify(self.item_title, strip=("img", "a")), 256)
        description = shorten(markdownify(self.item_description, strip=("img",)), 4096)
        author = self.item_author or self.item_feed_title

        # Discord also caps the embed total at 6000 characters, so trim the
        # description by however much the combined content overshoots
        combined_length = len(title) + len(description) + (len(author) * 2)
        cutoff = combined_length - 6000
        description = shorten(description, len(description) - cutoff) if cutoff > 0 else description

        embed = discord.Embed(
            title=title,
            description=description,
            url=self.item_url,
            colour=colour,
            timestamp=self.item_published
        )

        embed.set_image(url=self.item_image_url)
        embed.set_thumbnail(url=self.item_thumbnail_url)
        embed.set_author(
            name=author,
            url=self.item_author_url or self.item_feed_url
        )
        embed.set_footer(text=self.subscription.name)

        log.debug(f"created embed: {embed.to_dict()}")

        return embed
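
    # A hypothetical posting sketch (the channel wiring lives elsewhere in
    # the bot; `bot` and `content` are assumptions):
    #
    #   for channel in await content.subscription.get_discord_channels(bot):
    #       await channel.send(embed=content.embed)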