duplicate content enforcement
CI: Build and Push Docker Image / build (push) failed after 7m4s

Corban-Lee Jones 2024-11-03 22:37:52 +00:00
parent 1f9075ce60
commit 81795feb65
3 changed files with 204 additions and 55 deletions

File 1 of 3: the TaskCog cog

@@ -109,7 +109,7 @@ class TaskCog(commands.Cog):
await do_batch_job(servers, self.process_server, 10)
end_time = perf_counter()
log.debug(f"completed task in {end_time - start_time:.4f} seconds")
log.info(f"completed task in {end_time - start_time:.4f} seconds")
async def iterate_pages(self, url: str, params: dict={}):
@@ -147,6 +147,28 @@ class TaskCog(commands.Cog):
return models.Subscription.from_list(subscriptions)
async def get_contents(self, subscription: models.Subscription, raw_rss_content: dict):
contents = await models.Content.from_raw_rss(raw_rss_content, subscription, self.client)
duplicate_contents = []
async def check_duplicate_content(content: models.Content):
exists = await content.exists_via_api(
url=self.api_base_url + "content/",
headers=self.api_headers,
client=self.client
)
if exists:
log.debug(f"Removing duplicate {content}")
duplicate_contents.append(content)
await do_batch_job(contents, check_duplicate_content, 15)
for duplicate in duplicate_contents:
contents.remove(duplicate)
return contents
async def process_server(self, server: models.Server):
log.debug(f"processing server: {server.name}")
start_time = perf_counter()
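The `do_batch_job` helper called in these hunks is defined elsewhere in the repo and is not part of this diff. A minimal sketch of what it presumably does, assuming the `(items, coroutine, batch_size)` signature seen in the calls above:

import asyncio

# Hypothetical sketch only -- not the repo's actual implementation.
# Runs `func` over `items` in batches, capping how many run concurrently.
async def do_batch_job(items, func, batch_size: int):
    for i in range(0, len(items), batch_size):
        batch = items[i:i + batch_size]
        # complete one batch before starting the next
        await asyncio.gather(*(func(item) for item in batch))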
@@ -168,42 +190,17 @@ class TaskCog(commands.Cog):
if not raw_rss_content:
return
contents = await models.Content.from_raw_rss(raw_rss_content, subscription, self.client)
async def check_duplicate_content(content: models.Content):
params = {
"match_any": True, # allows any param to match, instead of needing all
"item_id": content.item_id,
"item_guid": content.item_guid,
"item_url": content.item_url,
"item_title": content.item_title,
"item_content_hash": content.item_content_hash,
"subscription": content.subscription_id
}
try:
response = await self.client.get(
self.api_base_url + f"content/",
headers=self.api_headers,
params=params
)
response.raise_for_status()
if len(response.json().get("results", [])):
log.debug("found duplicate")
contents.remove(content)
except httpx.HTTPError as exc:
log.error(f"assuming not duplicate {exc}")
# clear duplicate content
log.debug(f"checking for duplicates (count: {len(contents)})")
await do_batch_job(contents, check_duplicate_content, 15)
log.debug(f"finished looking for duplicates (count: {len(contents)})")
contents = await self.get_contents(subscription, raw_rss_content)
if not contents:
log.debug("no contents to process")
return
channels = await subscription.get_discord_channels(self.bot)
valid_contents, invalid_contents = subscription.filter_entries(contents)
async def send_content(channel: discord.TextChannel):
# BUG: I believe duplicate embeds end up in this list;
# Discord renders only one embed when the URLs match, but merges the images from both into that single embed
embeds = [content.embed for content in valid_contents]
batch_size = 10  # Discord allows at most 10 embeds per message
for i in range(0, len(embeds), batch_size):

File 2 of 3: models.py

@@ -217,18 +217,6 @@ class MessageStyle(DjangoDataModel):
return item
@dataclass(slots=True)
class UniqueContentRule(DjangoDataModel):
id: int
name: str
value: str
@staticmethod
def parser(item: dict) -> dict:
item["id"] = int(item.pop("id"))
return item
@dataclass(slots=True)
class DiscordChannel(DjangoDataModel):
id: int
@@ -255,7 +243,6 @@ class Subscription(DjangoDataModel):
channels: list[DiscordChannel]
filters: list[ContentFilter]
message_style: MessageStyle
unique_rules: UniqueContentRule
_server: Server | None = None
@staticmethod
@@ -268,7 +255,6 @@ class Subscription(DjangoDataModel):
item["channels"] = DiscordChannel.from_list(item.pop("channels_detail"))
item["filters"] = ContentFilter.from_list(item.pop("filters_detail"))
item["message_style"] = MessageStyle.from_dict(item.pop("message_style_detail"))
item["unique_rules"] = UniqueContentRule.from_list(item.pop("unique_rules_detail"))
return item
@property
@@ -358,6 +344,53 @@ class Content(DjangoDataModel):
item["subscription_id"] = item.pop("subscription")
return item
async def exists_via_api(self, url: str, headers: dict, client: httpx.AsyncClient):
log.debug(f"checking if {self.item_content_hash} exists via API")
params = {
"match_any": True, # allows any param to match, instead of needing all
"item_id": self.item_id,
"item_guid": self.item_guid,
"item_url": self.item_url,
"item_title": self.item_title,
"item_content_hash": self.item_content_hash,
"subscription": self.subscription_id
}
try:
response = await client.get(
url=url,
headers=headers,
params=params
)
response.raise_for_status()
except httpx.HTTPError as exc:
log.error(f"assuming not duplicate due to error: {exc}")
return False
# a non-empty "results" list means the content already exists server-side
return bool(response.json().get("results", []))
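`match_any` is a custom query parameter of the project's own API; the Django view that interprets it is not part of this commit. As a hedged sketch, the server presumably ORs the supplied identity filters together, roughly like:

from django.db.models import Q

# Hypothetical server-side reading of match_any=True (the API view is not in
# this diff): OR the identity fields rather than AND-ing them, while still
# scoping to the subscription -- an assumption about the API's intent.
def build_match_any_query(params: dict) -> Q:
    query = Q()
    for field in ("item_id", "item_guid", "item_url",
                  "item_title", "item_content_hash"):
        value = params.get(field)
        if value not in (None, ""):
            query |= Q(**{field: value})
    return query & Q(subscription=params.get("subscription"))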
def is_duplicate(self, other):
if not isinstance(other, Content):
raise ValueError(f"Expected Content, received {type(other)}")
other_details = other.duplicate_details
return any(
other_details.get(key) == value
for key, value in self.duplicate_details.items()
)
@property
def duplicate_details(self):
keys = [
"item_id",
"item_guid",
"item_url",
"item_title",
"item_content_hash"
]
data = asdict(self)
return { key: data[key] for key in keys }
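To make the rule in `is_duplicate` concrete: a match on any single one of the five identity fields flags the pair, even when every other field differs. A standalone illustration using plain dicts in place of `Content`:

first = {"item_id": "x#0", "item_guid": "x#0", "item_url": "https://example.com/x",
         "item_title": "Original title", "item_content_hash": "aaa"}
second = {"item_id": "x#0", "item_guid": "x#0", "item_url": "https://example.com/x",
          "item_title": "Revised title", "item_content_hash": "bbb"}
# True: id, guid and url all match, so the differing title and hash do not matter
assert any(second.get(key) == value for key, value in first.items())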
async def save(self, client: httpx.AsyncClient, base_url: str, headers: dict):
log.debug(f"saving content {self.item_content_hash}")
@@ -373,8 +406,8 @@ class Content(DjangoDataModel):
headers=headers,
data=data
)
log.debug(response.text)
response.raise_for_status()
log.debug(f"save success for {self.item_content_hash}")
@classmethod
async def from_raw_rss(cls, rss: str, subscription: Subscription, client: httpx.AsyncClient):
@@ -383,6 +416,13 @@ class Content(DjangoDataModel):
contents = []
async def create_content(entry: feedparser.FeedParserDict):
published = entry.get("published_parsed")
# published_parsed may be absent; fall back to the current time rather than crashing (assumption)
published = datetime(*published[:6], tzinfo=timezone.utc) if published else datetime.now(timezone.utc)
if published < subscription.publish_threshold:
log.debug("skipping due to publish threshold")
return
content_hash = hashlib.new("sha256")
content_hash.update(entry.get("description", "").encode())
@@ -391,9 +431,6 @@ class Content(DjangoDataModel):
if style.fetch_images:
item_image_url = await cls.get_image_url(item_url, client)
published = entry.get("published_parsed")
published = datetime(*published[0:6] if published else None, tzinfo=timezone.utc)
content = Content.from_dict({
"id": -1,
"subscription": subscription.id,
@@ -412,6 +449,12 @@ class Content(DjangoDataModel):
"item_feed_url": parsed_rss.get("feed", {}).get("link")
})
# Weed out duplicates
log.debug("weeding out duplicates")
if any(content.is_duplicate(other) for other in contents):
log.debug("found duplicate while loading rss data")
return
content.subscription = subscription
contents.append(content)
@@ -423,7 +466,11 @@ class Content(DjangoDataModel):
async def get_image_url(url: str, client: httpx.AsyncClient) -> str | None:
log.debug("Fetching image url")
response = await client.get(url, timeout=15)
try:
response = await client.get(url, timeout=15)
except httpx.HTTPError:
return None
soup = BeautifulSoup(response.text, "html.parser")
image_element = soup.select_one("meta[property='og:image']")
if not image_element:
@@ -470,4 +517,6 @@ class Content(DjangoDataModel):
)
embed.set_footer(text=self.subscription.name)
log.debug(f"created embed: {embed.to_dict()}")
return embed

File 3 of 3: the test module

@@ -1,8 +1,9 @@
import re
def test_content_filters():
"""
In this test, a content filter is created and used to filter some data.
"""
from models import ContentFilter, MatchingAlgorithm
content_filter = ContentFilter(
@@ -26,8 +27,110 @@ def test_content_filters():
print("success")
def test_content_duplicates():
"""
In this test, two nearly identical instances of `Content` are created and checked
against each other for duplication.
They should be considered duplicates, because not every field has to match for two
items to count as duplicates.
The data comes from a real-world case in which an item that should have been flagged
as a duplicate was missed.
"""
from models import Content
from datetime import datetime
datetime_now = datetime.now()
first_content = Content(
id=0,
subscription_id=38,
item_id="https://www.bbc.com/news/articles/ced9l7799w9o#0",
item_guid="https://www.bbc.com/news/articles/ced9l7799w9o#0",
item_url="https://www.bbc.com/news/articles/ced9l7799w9o",
item_title="Spain's PM orders 10,000 troops and police to Valencia",
item_description="",
item_content_hash="4a2ba8429a0584ce08f521db3f6d3000c248467f2cc6fa4b0458808169247ad8",
item_image_url="",
item_thumbnail_url="",
item_published=datetime_now,
item_author="",
item_author_url="",
item_feed_title="",
item_feed_url="",
blocked=False,
)
second_content = Content(
id=1,
subscription_id=38,
item_id="https://www.bbc.com/news/articles/ced9l7799w9o#0",
item_guid="https://www.bbc.com/news/articles/ced9l7799w9o#0",
item_url="https://www.bbc.com/news/articles/ced9l7799w9o",
item_title="Spain's PM orders 10,000 troops and police to flood-hit Valencia",
item_description="",
item_content_hash="4a2ba8429a0584ce08f521db3f6d3000c248467f2cc6fa4b0458808169247ad8",
item_image_url="",
item_thumbnail_url="",
item_published=datetime_now,
item_author="",
item_author_url="",
item_feed_title="",
item_feed_url="",
blocked=False,
)
assert first_content.is_duplicate(second_content), "Content is not considered a duplicate"
print("1 success")
# BUG: this pair is identified as a duplicate but still gets processed...
third_content = Content(
id=0,
subscription_id=38,
item_id="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o#8",
item_guid="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o#8",
item_url="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o",
item_title="Sao Paulo GP qualifying set for Sunday after rain postponement",
item_description="",
item_content_hash="b6c78de554a183cfeca88decf987401719d431647523f038a86fd7d972e4e799",
item_image_url="",
item_thumbnail_url="",
item_published=datetime_now,
item_author="",
item_author_url="",
item_feed_title="",
item_feed_url="",
blocked=False,
)
fourth_content = Content(
id=0,
subscription_id=38,
item_id="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o#8",
item_guid="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o#8",
item_url="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o",
item_title="Sao Paulo GP qualifying set for Sunday after rain postponement",
item_description="",
item_content_hash="6ddd15d7d9626f2d63ba5631056fda9bcaf920e8c82ec5c23fa824b02ce690d0",
item_image_url="",
item_thumbnail_url="",
item_published=datetime_now,
item_author="",
item_author_url="",
item_feed_title="",
item_feed_url="",
blocked=False,
)
assert third_content.is_duplicate(fourth_content)
print("2 success")
def main():
test_content_filters()
# test_content_filters()
test_content_duplicates()
if __name__ == "__main__":
main()