duplicate content enforcement
CI: Build and Push Docker Image / build (push) failed after 7m4s

Corban-Lee Jones 2024-11-03 22:37:52 +00:00
parent 1f9075ce60
commit 81795feb65
3 changed files with 204 additions and 55 deletions

File 1 of 3: the TaskCog cog

@@ -109,7 +109,7 @@ class TaskCog(commands.Cog):
await do_batch_job(servers, self.process_server, 10)
end_time = perf_counter()
log.debug(f"completed task in {end_time - start_time:.4f} seconds")
log.info(f"completed task in {end_time - start_time:.4f} seconds")
async def iterate_pages(self, url: str, params: dict={}):
@@ -147,6 +147,28 @@ class TaskCog(commands.Cog):
return models.Subscription.from_list(subscriptions)
async def get_contents(self, subscription: models.Subscription, raw_rss_content: dict):
contents = await models.Content.from_raw_rss(raw_rss_content, subscription, self.client)
duplicate_contents = []
async def check_duplicate_content(content: models.Content):
exists = await content.exists_via_api(
url=self.api_base_url + "content/",
headers=self.api_headers,
client=self.client
)
if exists:
log.debug(f"Removing duplicate {content}")
duplicate_contents.append(content)
await do_batch_job(contents, check_duplicate_content, 15)
for duplicate in duplicate_contents:
contents.remove(duplicate)
return contents
async def process_server(self, server: models.Server):
log.debug(f"processing server: {server.name}")
start_time = perf_counter()
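The `do_batch_job` helper called in these hunks is defined elsewhere in the repo and is not part of this diff. A minimal sketch of what it presumably does, assuming the `(items, coroutine, batch_size)` signature seen in the calls above:

import asyncio

# Hypothetical sketch only -- not the repo's actual implementation.
# Runs `func` over `items` in batches, capping how many run concurrently.
async def do_batch_job(items, func, batch_size: int):
    for i in range(0, len(items), batch_size):
        batch = items[i:i + batch_size]
        # complete one batch before starting the next
        await asyncio.gather(*(func(item) for item in batch))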
@@ -168,42 +190,17 @@ class TaskCog(commands.Cog):
if not raw_rss_content:
return
contents = await models.Content.from_raw_rss(raw_rss_content, subscription, self.client)
async def check_duplicate_content(content: models.Content):
params = {
"match_any": True, # allows any param to match, instead of needing all
"item_id": content.item_id,
"item_guid": content.item_guid,
"item_url": content.item_url,
"item_title": content.item_title,
"item_content_hash": content.item_content_hash,
"subscription": content.subscription_id
}
try:
response = await self.client.get(
self.api_base_url + f"content/",
headers=self.api_headers,
params=params
)
response.raise_for_status()
if len(response.json().get("results", [])):
log.debug("found duplicate")
contents.remove(content)
except httpx.HTTPError as exc:
log.error(f"assuming not duplicate {exc}")
# clear duplicate content
log.debug(f"checking for duplicates (count: {len(contents)})")
await do_batch_job(contents, check_duplicate_content, 15)
log.debug(f"finished looking for duplicates (count: {len(contents)})")
contents = await self.get_contents(subscription, raw_rss_content)
if not contents:
log.debug("no contents to process")
return
channels = await subscription.get_discord_channels(self.bot)
valid_contents, invalid_contents = subscription.filter_entries(contents)
async def send_content(channel: discord.TextChannel):
# BUG: I believe duplicate embeds end up in this list;
# Discord renders only one embed when the URLs match, but merges the images from both into that single embed
embeds = [content.embed for content in valid_contents]
batch_size = 10  # Discord allows at most 10 embeds per message
for i in range(0, len(embeds), batch_size):

File 2 of 3: models.py

@@ -217,18 +217,6 @@ class MessageStyle(DjangoDataModel):
return item
@dataclass(slots=True)
class UniqueContentRule(DjangoDataModel):
id: int
name: str
value: str
@staticmethod
def parser(item: dict) -> dict:
item["id"] = int(item.pop("id"))
return item
@dataclass(slots=True)
class DiscordChannel(DjangoDataModel):
id: int
@@ -255,7 +243,6 @@ class Subscription(DjangoDataModel):
channels: list[DiscordChannel]
filters: list[ContentFilter]
message_style: MessageStyle
unique_rules: UniqueContentRule
_server: Server | None = None
@staticmethod
@@ -268,7 +255,6 @@ class Subscription(DjangoDataModel):
item["channels"] = DiscordChannel.from_list(item.pop("channels_detail"))
item["filters"] = ContentFilter.from_list(item.pop("filters_detail"))
item["message_style"] = MessageStyle.from_dict(item.pop("message_style_detail"))
item["unique_rules"] = UniqueContentRule.from_list(item.pop("unique_rules_detail"))
return item
@property
@@ -358,6 +344,53 @@ class Content(DjangoDataModel):
item["subscription_id"] = item.pop("subscription")
return item
async def exists_via_api(self, url: str, headers: dict, client: httpx.AsyncClient):
log.debug(f"checking if {self.item_content_hash} exists via API")
params = {
"match_any": True, # allows any param to match, instead of needing all
"item_id": self.item_id,
"item_guid": self.item_guid,
"item_url": self.item_url,
"item_title": self.item_title,
"item_content_hash": self.item_content_hash,
"subscription": self.subscription_id
}
try:
response = await client.get(
url=url,
headers=headers,
params=params
)
response.raise_for_status()
except httpx.HTTPError as exc:
log.error(f"assuming not duplicate due to error: {exc}")
return False
# a non-empty "results" list means the content already exists server-side
return bool(response.json().get("results", []))
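`match_any` is a custom query parameter of the project's own API; the Django view that interprets it is not part of this commit. As a hedged sketch, the server presumably ORs the supplied identity filters together, roughly like:

from django.db.models import Q

# Hypothetical server-side reading of match_any=True (the API view is not in
# this diff): OR the identity fields rather than AND-ing them, while still
# scoping to the subscription -- an assumption about the API's intent.
def build_match_any_query(params: dict) -> Q:
    query = Q()
    for field in ("item_id", "item_guid", "item_url",
                  "item_title", "item_content_hash"):
        value = params.get(field)
        if value not in (None, ""):
            query |= Q(**{field: value})
    return query & Q(subscription=params.get("subscription"))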
def is_duplicate(self, other):
if not isinstance(other, Content):
raise ValueError(f"Expected Content, received {type(other)}")
other_details = other.duplicate_details
return any(
other_details.get(key) == value
for key, value in self.duplicate_details.items()
)
@property
def duplicate_details(self):
keys = [
"item_id",
"item_guid",
"item_url",
"item_title",
"item_content_hash"
]
data = asdict(self)
return { key: data[key] for key in keys }
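To make the rule in `is_duplicate` concrete: a match on any single one of the five identity fields flags the pair, even when every other field differs. A standalone illustration using plain dicts in place of `Content`:

first = {"item_id": "x#0", "item_guid": "x#0", "item_url": "https://example.com/x",
         "item_title": "Original title", "item_content_hash": "aaa"}
second = {"item_id": "x#0", "item_guid": "x#0", "item_url": "https://example.com/x",
          "item_title": "Revised title", "item_content_hash": "bbb"}
# True: id, guid and url all match, so the differing title and hash do not matter
assert any(second.get(key) == value for key, value in first.items())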
async def save(self, client: httpx.AsyncClient, base_url: str, headers: dict):
log.debug(f"saving content {self.item_content_hash}")
@@ -373,8 +406,8 @@ class Content(DjangoDataModel):
headers=headers,
data=data
)
log.debug(response.text)
response.raise_for_status()
log.debug(f"save success for {self.item_content_hash}")
@classmethod
async def from_raw_rss(cls, rss: str, subscription: Subscription, client: httpx.AsyncClient):
@@ -383,6 +416,13 @@ class Content(DjangoDataModel):
contents = []
async def create_content(entry: feedparser.FeedParserDict):
published = entry.get("published_parsed")
# published_parsed may be absent; fall back to the current time rather than crashing (assumption)
published = datetime(*published[:6], tzinfo=timezone.utc) if published else datetime.now(timezone.utc)
if published < subscription.publish_threshold:
log.debug("skipping due to publish threshold")
return
content_hash = hashlib.new("sha256")
content_hash.update(entry.get("description", "").encode())
@@ -391,9 +431,6 @@ class Content(DjangoDataModel):
if style.fetch_images:
item_image_url = await cls.get_image_url(item_url, client)
published = entry.get("published_parsed")
published = datetime(*published[0:6] if published else None, tzinfo=timezone.utc)
content = Content.from_dict({
"id": -1,
"subscription": subscription.id,
@@ -412,6 +449,12 @@ class Content(DjangoDataModel):
"item_feed_url": parsed_rss.get("feed", {}).get("link")
})
# Weed out duplicates
log.debug("weeding out duplicates")
if any(content.is_duplicate(other) for other in contents):
log.debug("found duplicate while loading rss data")
return
content.subscription = subscription
contents.append(content)
@@ -423,7 +466,11 @@ class Content(DjangoDataModel):
async def get_image_url(url: str, client: httpx.AsyncClient) -> str | None:
log.debug("Fetching image url")
response = await client.get(url, timeout=15)
try:
response = await client.get(url, timeout=15)
except httpx.HTTPError:
return None
soup = BeautifulSoup(response.text, "html.parser")
image_element = soup.select_one("meta[property='og:image']")
if not image_element:
@@ -470,4 +517,6 @@ class Content(DjangoDataModel):
)
embed.set_footer(text=self.subscription.name)
log.debug(f"created embed: {embed.to_dict()}")
return embed

File 3 of 3: the test module

@@ -1,8 +1,9 @@
import re
def test_content_filters():
"""
In this test, a content filter is created and used to filter some data.
"""
from models import ContentFilter, MatchingAlgorithm
content_filter = ContentFilter(
@@ -26,8 +27,110 @@ def test_content_filters():
print("success")
def test_content_duplicates():
"""
In this test, two nearly identical instances of `Content` are created and checked
against each other for duplication.
They should be considered duplicates, because not every field has to match for two
items to count as duplicates.
The data comes from a real-world case in which an item that should have been flagged
as a duplicate was missed.
"""
from models import Content
from datetime import datetime
datetime_now = datetime.now()
first_content = Content(
id=0,
subscription_id=38,
item_id="https://www.bbc.com/news/articles/ced9l7799w9o#0",
item_guid="https://www.bbc.com/news/articles/ced9l7799w9o#0",
item_url="https://www.bbc.com/news/articles/ced9l7799w9o",
item_title="Spain's PM orders 10,000 troops and police to Valencia",
item_description="",
item_content_hash="4a2ba8429a0584ce08f521db3f6d3000c248467f2cc6fa4b0458808169247ad8",
item_image_url="",
item_thumbnail_url="",
item_published=datetime_now,
item_author="",
item_author_url="",
item_feed_title="",
item_feed_url="",
blocked=False,
)
second_content = Content(
id=1,
subscription_id=38,
item_id="https://www.bbc.com/news/articles/ced9l7799w9o#0",
item_guid="https://www.bbc.com/news/articles/ced9l7799w9o#0",
item_url="https://www.bbc.com/news/articles/ced9l7799w9o",
item_title="Spain's PM orders 10,000 troops and police to flood-hit Valencia",
item_description="",
item_content_hash="4a2ba8429a0584ce08f521db3f6d3000c248467f2cc6fa4b0458808169247ad8",
item_image_url="",
item_thumbnail_url="",
item_published=datetime_now,
item_author="",
item_author_url="",
item_feed_title="",
item_feed_url="",
blocked=False,
)
assert first_content.is_duplicate(second_content), "Content is not considered a duplicate"
print("1 success")
# BUG: this pair is identified as a duplicate but still gets processed...
third_content = Content(
id=0,
subscription_id=38,
item_id="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o#8",
item_guid="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o#8",
item_url="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o",
item_title="Sao Paulo GP qualifying set for Sunday after rain postponement",
item_description="",
item_content_hash="b6c78de554a183cfeca88decf987401719d431647523f038a86fd7d972e4e799",
item_image_url="",
item_thumbnail_url="",
item_published=datetime_now,
item_author="",
item_author_url="",
item_feed_title="",
item_feed_url="",
blocked=False,
)
fourth_content = Content(
id=0,
subscription_id=38,
item_id="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o#8",
item_guid="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o#8",
item_url="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o",
item_title="Sao Paulo GP qualifying set for Sunday after rain postponement",
item_description="",
item_content_hash="6ddd15d7d9626f2d63ba5631056fda9bcaf920e8c82ec5c23fa824b02ce690d0",
item_image_url="",
item_thumbnail_url="",
item_published=datetime_now,
item_author="",
item_author_url="",
item_feed_title="",
item_feed_url="",
blocked=False,
)
assert third_content.is_duplicate(fourth_content)
print("2 success")
def main():
test_content_filters()
# test_content_filters()
test_content_duplicates()
if __name__ == "__main__":
main()