diff --git a/src/extensions/tasks.py b/src/extensions/tasks.py
index 2434f13..9eea275 100644
--- a/src/extensions/tasks.py
+++ b/src/extensions/tasks.py
@@ -109,7 +109,7 @@ class TaskCog(commands.Cog):
         await do_batch_job(servers, self.process_server, 10)
 
         end_time = perf_counter()
-        log.debug(f"completed task in {end_time - start_time:.4f} seconds")
+        log.info(f"completed task in {end_time - start_time:.4f} seconds")
 
 
     async def iterate_pages(self, url: str, params: dict={}):
@@ -147,6 +147,28 @@ class TaskCog(commands.Cog):
 
         return models.Subscription.from_list(subscriptions)
 
+    async def get_contents(self, subscription: models.Subscription, raw_rss_content: dict):
+        contents = await models.Content.from_raw_rss(raw_rss_content, subscription, self.client)
+        duplicate_contents = []
+
+        async def check_duplicate_content(content: models.Content):
+            exists = await content.exists_via_api(
+                url=self.api_base_url + "content/",
+                headers=self.api_headers,
+                client=self.client
+            )
+
+            if exists:
+                log.debug(f"Removing duplicate {content}")
+                duplicate_contents.append(content)
+
+        await do_batch_job(contents, check_duplicate_content, 15)
+
+        for duplicate in duplicate_contents:
+            contents.remove(duplicate)
+
+        return contents
+
     async def process_server(self, server: models.Server):
         log.debug(f"processing server: {server.name}")
         start_time = perf_counter()
@@ -168,42 +190,17 @@ class TaskCog(commands.Cog):
         if not raw_rss_content:
             return
 
-        contents = await models.Content.from_raw_rss(raw_rss_content, subscription, self.client)
-
-        async def check_duplicate_content(content: models.Content):
-            params = {
-                "match_any": True, # allows any param to match, instead of needing all
-                "item_id": content.item_id,
-                "item_guid": content.item_guid,
-                "item_url": content.item_url,
-                "item_title": content.item_title,
-                "item_content_hash": content.item_content_hash,
-                "subscription": content.subscription_id
-            }
-
-            try:
-                response = await self.client.get(
-                    self.api_base_url + f"content/",
-                    headers=self.api_headers,
-                    params=params
-                )
-                response.raise_for_status()
-
-                if len(response.json().get("results", [])):
-                    log.debug("found duplicate")
-                    contents.remove(content)
-            except httpx.HTTPError as exc:
-                log.error(f"assuming not duplicate {exc}")
-
-        # clear duplicate content
-        log.debug(f"checking for duplicates (count: {len(contents)})")
-        await do_batch_job(contents, check_duplicate_content, 15)
-        log.debug(f"finished looking for duplicates (count: {len(contents)})")
+        contents = await self.get_contents(subscription, raw_rss_content)
+        if not contents:
+            log.debug("no contents to process")
+            return
 
         channels = await subscription.get_discord_channels(self.bot)
         valid_contents, invalid_contents = subscription.filter_entries(contents)
 
         async def send_content(channel: discord.TextChannel):
+            # BUG: I believe there are duplicate embeds here.
+            # Discord only shows one embed when the URLs match, but merges the images from both into that one embed.
             embeds = [content.embed for content in valid_contents]
             batch_size = 10
             for i in range(0, len(embeds), batch_size):
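Note: both the task loop and the new TaskCog.get_contents() lean on a do_batch_job(items, job, batch_size) helper that is defined elsewhere in this repo and not shown in the diff. The sketch below is only an assumption of what such a helper typically looks like (run the coroutine over the items, at most batch_size at a time); the real implementation may differ. Collecting duplicates into a separate list and removing them after the batch finishes, rather than calling contents.remove() from inside the batched coroutine as the old inline version did, presumably avoids mutating the list while it is still being iterated.

    # Hypothetical sketch only -- do_batch_job is defined elsewhere in the repo,
    # and its real signature and behaviour may differ from this assumption.
    import asyncio

    async def do_batch_job(items, job, batch_size):
        # Run the coroutine `job` over `items`, at most `batch_size` at a time.
        for i in range(0, len(items), batch_size):
            batch = items[i:i + batch_size]
            await asyncio.gather(*(job(item) for item in batch))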
diff --git a/src/models.py b/src/models.py
index bcddb33..2dfb838 100644
--- a/src/models.py
+++ b/src/models.py
@@ -217,18 +217,6 @@ class MessageStyle(DjangoDataModel):
         return item
 
 
-@dataclass(slots=True)
-class UniqueContentRule(DjangoDataModel):
-    id: int
-    name: str
-    value: str
-
-    @staticmethod
-    def parser(item: dict) -> dict:
-        item["id"] = int(item.pop("id"))
-        return item
-
-
 @dataclass(slots=True)
 class DiscordChannel(DjangoDataModel):
     id: int
@@ -255,7 +243,6 @@ class Subscription(DjangoDataModel):
     channels: list[DiscordChannel]
     filters: list[ContentFilter]
     message_style: MessageStyle
-    unique_rules: UniqueContentRule
     _server: Server | None = None
 
     @staticmethod
@@ -268,7 +255,6 @@ class Subscription(DjangoDataModel):
         item["channels"] = DiscordChannel.from_list(item.pop("channels_detail"))
         item["filters"] = ContentFilter.from_list(item.pop("filters_detail"))
         item["message_style"] = MessageStyle.from_dict(item.pop("message_style_detail"))
-        item["unique_rules"] = UniqueContentRule.from_list(item.pop("unique_rules_detail"))
         return item
 
     @property
@@ -358,6 +344,53 @@ class Content(DjangoDataModel):
         item["subscription_id"] = item.pop("subscription")
         return item
 
+    async def exists_via_api(self, url: str, headers: dict, client: httpx.AsyncClient):
+        log.debug(f"checking if {self.item_content_hash} exists via API")
+        params = {
+            "match_any": True, # allows any param to match, instead of needing all
+            "item_id": self.item_id,
+            "item_guid": self.item_guid,
+            "item_url": self.item_url,
+            "item_title": self.item_title,
+            "item_content_hash": self.item_content_hash,
+            "subscription": self.subscription_id
+        }
+
+        try:
+            response = await client.get(
+                url=url,
+                headers=headers,
+                params=params
+            )
+            response.raise_for_status()
+        except httpx.HTTPError as exc:
+            log.error(f"assuming not duplicate due to error: {exc}")
+            return False
+
+        return response.json().get("results", [])
+
+    def is_duplicate(self, other):
+        if not isinstance(other, Content):
+            raise ValueError(f"Expected Content, received {type(other)}")
+
+        other_details = other.duplicate_details
+        return any(
+            other_details.get(key) == value
+            for key, value in self.duplicate_details.items()
+        )
+
+    @property
+    def duplicate_details(self):
+        keys = [
+            "item_id",
+            "item_guid",
+            "item_url",
+            "item_title",
+            "item_content_hash"
+        ]
+        data = asdict(self)
+        return { key: data[key] for key in keys }
+
     async def save(self, client: httpx.AsyncClient, base_url: str, headers: dict):
         log.debug(f"saving content {self.item_content_hash}")
 
@@ -373,8 +406,8 @@ class Content(DjangoDataModel):
             headers=headers,
             data=data
         )
-        log.debug(response.text)
         response.raise_for_status()
+        log.debug(f"save success for {self.item_content_hash}")
 
     @classmethod
     async def from_raw_rss(cls, rss: str, subscription: Subscription, client: httpx.AsyncClient):
@@ -383,6 +416,13 @@ class Content(DjangoDataModel):
         contents = []
 
         async def create_content(entry: feedparser.FeedParserDict):
+            published = entry.get("published_parsed")
+            published = datetime(*published[0:6] if published else None, tzinfo=timezone.utc)
+
+            if published < subscription.publish_threshold:
+                log.debug("skipping due to publish threshold")
+                return
+
             content_hash = hashlib.new("sha256")
             content_hash.update(entry.get("description", "").encode())
 
@@ -391,9 +431,6 @@ class Content(DjangoDataModel):
             if style.fetch_images:
                 item_image_url = await cls.get_image_url(item_url, client)
 
-            published = entry.get("published_parsed")
-            published = datetime(*published[0:6] if published else None, tzinfo=timezone.utc)
-
             content = Content.from_dict({
                 "id": -1,
                 "subscription": subscription.id,
@@ -412,6 +449,12 @@
                 "item_feed_url": parsed_rss.get("feed", {}).get("link")
             })
 
+            # Weed out duplicates
+            log.debug("weeding out duplicates")
+            if any(content.is_duplicate(other) for other in contents):
+                log.debug("found duplicate while loading rss data")
+                return
+
             content.subscription = subscription
             contents.append(content)
 
@@ -423,7 +466,11 @@ class Content(DjangoDataModel):
     async def get_image_url(url: str, client: httpx.AsyncClient) -> str | None:
         log.debug("Fetching image url")
 
-        response = await client.get(url, timeout=15)
+        try:
+            response = await client.get(url, timeout=15)
+        except httpx.HTTPError:
+            return None
+
         soup = BeautifulSoup(response.text, "html.parser")
         image_element = soup.select_one("meta[property='og:image']")
         if not image_element:
@@ -470,4 +517,6 @@ class Content(DjangoDataModel):
         )
         embed.set_footer(text=self.subscription.name)
 
+        log.debug(f"created embed: {embed.to_dict()}")
+
         return embed
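Note on the from_raw_rss() change above: feedparser exposes an entry's publish date as published_parsed, a time.struct_time, which the new guard converts to a timezone-aware datetime before comparing it against subscription.publish_threshold (not shown in this diff; assumed here to be an aware datetime). As written, an entry with no published_parsed at all would unpack None and raise a TypeError rather than fall back to a default, which may or may not be intended. A minimal, standalone illustration of the conversion and comparison:

    # Standalone illustration of the publish-threshold guard; `publish_threshold`
    # is a made-up stand-in for subscription.publish_threshold (not in the diff).
    import time
    from datetime import datetime, timezone

    published_parsed = time.gmtime(0)  # feedparser-style struct_time (1970-01-01)
    published = datetime(*published_parsed[0:6], tzinfo=timezone.utc)

    publish_threshold = datetime(2024, 1, 1, tzinfo=timezone.utc)
    if published < publish_threshold:
        print("skipping due to publish threshold")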
diff --git a/src/tests.py b/src/tests.py
index 504ca34..41e596b 100644
--- a/src/tests.py
+++ b/src/tests.py
@@ -1,8 +1,9 @@
-import re
-
-
 def test_content_filters():
+    """
+    In this test, a content filter is created and used to filter some data.
+    """
+
     from models import ContentFilter, MatchingAlgorithm
 
     content_filter = ContentFilter(
@@ -26,8 +27,110 @@ def test_content_filters():
     print("success")
 
 
+def test_content_duplicates():
+    """
+    In this test, two almost but not quite identical instances of `Content` are created and
+    checked against each other as duplicates.
+
+    They should be considered duplicates, because not all fields need to match for two items to count as duplicates.
+
+    The data comes from a real-world example where an item that should have been treated as a duplicate was missed.
+    """
+
+    from models import Content
+    from datetime import datetime
+
+    datetime_now = datetime.now()
+
+    first_content = Content(
+        id=0,
+        subscription_id=38,
+        item_id="https://www.bbc.com/news/articles/ced9l7799w9o#0",
+        item_guid="https://www.bbc.com/news/articles/ced9l7799w9o#0",
+        item_url="https://www.bbc.com/news/articles/ced9l7799w9o",
+        item_title="Spain's PM orders 10,000 troops and police to Valencia",
+        item_description="",
+        item_content_hash="4a2ba8429a0584ce08f521db3f6d3000c248467f2cc6fa4b0458808169247ad8",
+        item_image_url="",
+        item_thumbnail_url="",
+        item_published=datetime_now,
+        item_author="",
+        item_author_url="",
+        item_feed_title="",
+        item_feed_url="",
+        blocked=False,
+    )
+
+    second_content = Content(
+        id=1,
+        subscription_id=38,
+        item_id="https://www.bbc.com/news/articles/ced9l7799w9o#0",
+        item_guid="https://www.bbc.com/news/articles/ced9l7799w9o#0",
+        item_url="https://www.bbc.com/news/articles/ced9l7799w9o",
+        item_title="Spain's PM orders 10,000 troops and police to flood-hit Valencia",
+        item_description="",
+        item_content_hash="4a2ba8429a0584ce08f521db3f6d3000c248467f2cc6fa4b0458808169247ad8",
+        item_image_url="",
+        item_thumbnail_url="",
+        item_published=datetime_now,
+        item_author="",
+        item_author_url="",
+        item_feed_title="",
+        item_feed_url="",
+        blocked=False,
+    )
+
+    assert first_content.is_duplicate(second_content), "Content is not considered a duplicate"
+
+    print("1 success")
+
+    # BUG: This one is identified but still gets processed...
+
+    third_content = Content(
+        id=0,
+        subscription_id=38,
+        item_id="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o#8",
+        item_guid="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o#8",
+        item_url="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o",
+        item_title="Sao Paulo GP qualifying set for Sunday after rain postponement",
+        item_description="",
+        item_content_hash="b6c78de554a183cfeca88decf987401719d431647523f038a86fd7d972e4e799",
+        item_image_url="",
+        item_thumbnail_url="",
+        item_published=datetime_now,
+        item_author="",
+        item_author_url="",
+        item_feed_title="",
+        item_feed_url="",
+        blocked=False,
+    )
+
+    fourth_content = Content(
+        id=0,
+        subscription_id=38,
+        item_id="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o#8",
+        item_guid="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o#8",
+        item_url="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o",
+        item_title="Sao Paulo GP qualifying set for Sunday after rain postponement",
+        item_description="",
+        item_content_hash="6ddd15d7d9626f2d63ba5631056fda9bcaf920e8c82ec5c23fa824b02ce690d0",
+        item_image_url="",
+        item_thumbnail_url="",
+        item_published=datetime_now,
+        item_author="",
+        item_author_url="",
+        item_feed_title="",
+        item_feed_url="",
+        blocked=False,
+    )
+
+    assert third_content.is_duplicate(fourth_content)
+
+    print("2 success")
+
 def main():
-    test_content_filters()
+    # test_content_filters()
+    test_content_duplicates()
 
 
 if __name__ == "__main__":
     main()
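For reference, the duplicate rule exercised by the tests above boils down to: two Content items count as duplicates as soon as any one of the five identifying fields (item_id, item_guid, item_url, item_title, item_content_hash) is equal, even if the others, such as the content hash, differ. A distilled, standalone illustration using plain dicts instead of the real Content model:

    # Illustration only -- mirrors the shape of Content.is_duplicate(), which
    # compares the identifying fields of one item against another.
    def any_field_matches(a: dict, b: dict) -> bool:
        return any(b.get(key) == value for key, value in a.items())

    a = {"item_url": "https://www.bbc.com/news/articles/ced9l7799w9o", "item_content_hash": "aaa"}
    b = {"item_url": "https://www.bbc.com/news/articles/ced9l7799w9o", "item_content_hash": "bbb"}
    assert any_field_matches(a, b)  # the shared URL alone makes them duplicates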